[Sync] Sync patch from openeuler/gcc

This commit is contained in:
wangding16 2024-04-24 12:44:41 +08:00
parent 25bccb60b0
commit 2707dd6474
6 changed files with 956 additions and 0 deletions

View File

@ -0,0 +1,220 @@
From cfffbec938afdc45c31db5ec282ce21ad1ba2dc7 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 10 Aug 2023 11:41:39 +0800
Subject: [PATCH 11/32] Software mitigation: Disable gather generation in
vectorization for GDS affected Intel Processors.
For more details of GDS (Gather Data Sampling), refer to
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html
After microcode update, there's performance regression. To avoid that,
the patch disables gather generation in autovectorization but uses
gather scalar emulation instead.
gcc/ChangeLog:
* config/i386/i386-options.cc (m_GDS): New macro.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
enable for m_GDS.
(X86_TUNE_USE_GATHER_4PARTS): Ditto.
(X86_TUNE_USE_GATHER): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
gather vectorization.
* gcc.target/i386/avx2-gather-6.c: Ditto.
* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
* gcc.target/i386/pr88531-1b.c: Ditto.
* gcc.target/i386/pr88531-1c.c: Ditto.
(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d)
---
gcc/config/i386/i386-options.cc | 5 +++++
gcc/config/i386/x86-tune.def | 9 ++++++---
gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 2 +-
gcc/testsuite/gcc.target/i386/avx2-gather-6.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c | 2 +-
gcc/testsuite/gcc.target/i386/pr88531-1b.c | 2 +-
gcc/testsuite/gcc.target/i386/pr88531-1c.c | 2 +-
12 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index fb2ed942f..9617fc162 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3. If not see
#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
#define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
+/* Gather Data Sampling / CVE-2022-40982 / INTEL-SA-00828.
+ Software mitigation. */
+#define m_GDS (m_SKYLAKE | m_SKYLAKE_AVX512 | m_CANNONLAKE \
+ | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
+ | m_TIGERLAKE | m_COOPERLAKE | m_ROCKETLAKE)
#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
#define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e6b9e2125..4392709fc 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -467,7 +467,8 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
+ | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements. */
@@ -477,7 +478,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
+ | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements. */
@@ -487,7 +489,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
+ | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements. */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
index ad5ef7310..978924b0f 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */
+/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake -mtune=haswell" } */
#include "avx2-gather-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
index b9119581a..067b251e3 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=skylake" } */
+/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=haswell" } */
#include "avx2-gather-5.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
index 06d21bb01..d1a229861 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
index 462e951fd..d7b0b2b28 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
index 55a28dddb..07439185e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
index 969600885..3a9810827 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
index 6b0c8a859..ac669e048 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
index 3af568ab3..14a1083b6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1b.c b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
index 812c8a10f..e6df789de 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-1b.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=skylake -mfpmath=sse" } */
+/* { dg-options "-O3 -march=skylake -mfpmath=sse -mtune=haswell" } */
#include "pr88531-1a.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1c.c b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
index 43fc5913e..a093c87c0 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-1c.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse" } */
+/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse -mtune=haswell" } */
#include "pr88531-1a.c"
--
2.28.0.windows.1

View File

@ -0,0 +1,187 @@
From c269629130cb23252da2db026ce9ed13f57f69f4 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 10 Aug 2023 16:26:13 +0800
Subject: [PATCH 12/32] Support -m[no-]gather -m[no-]scatter to enable/disable
vectorization for all gather/scatter instructions
Rename original use_gather to use_gather_8parts, Support
-mtune-ctrl={,^}use_gather to set/clear tune features
use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather
as alias of -mtune-ctrl=, use_gather, ^use_gather.
Similar for use_scatter.
gcc/ChangeLog:
* config/i386/i386-builtins.cc
(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
* config/i386/i386-options.cc (parse_mtune_ctrl_str):
Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
8parts} for -mtune-crtl={,^}{use_gather,use_scatter}.
* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
for use_scatter_8parts
* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
(TARGET_USE_GATHER_8PARTS): .. this.
(TARGET_USE_SCATTER): Rename to ..
(TARGET_USE_SCATTER_8PARTS): .. this.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
(X86_TUNE_USE_GATHER_8PARTS): .. this.
(X86_TUNE_USE_SCATTER): Rename to
(X86_TUNE_USE_SCATTER_8PARTS): .. this.
* config/i386/i386.opt: Add new options mgather, mscatter.
(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737)
---
gcc/config/i386/i386-builtins.cc | 2 +-
gcc/config/i386/i386-options.cc | 54 +++++++++++++++++++++++---------
gcc/config/i386/i386.cc | 2 +-
gcc/config/i386/i386.h | 8 ++---
gcc/config/i386/i386.opt | 4 +++
gcc/config/i386/x86-tune.def | 4 +--
6 files changed, 52 insertions(+), 22 deletions(-)
diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index 050c6228a..8ed32e14f 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
? !TARGET_USE_GATHER_2PARTS
: (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
? !TARGET_USE_GATHER_4PARTS
- : !TARGET_USE_GATHER)))
+ : !TARGET_USE_GATHER_8PARTS)))
return NULL_TREE;
if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9617fc162..3df1f0c41 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump)
curr_feature_string++;
clear = true;
}
- for (i = 0; i < X86_TUNE_LAST; i++)
- {
- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
- {
- ix86_tune_features[i] = !clear;
- if (dump)
- fprintf (stderr, "Explicitly %s feature %s\n",
- clear ? "clear" : "set", ix86_tune_feature_names[i]);
- break;
- }
- }
- if (i == X86_TUNE_LAST)
- error ("unknown parameter to option %<-mtune-ctrl%>: %s",
- clear ? curr_feature_string - 1 : curr_feature_string);
+
+ if (!strcmp (curr_feature_string, "use_gather"))
+ {
+ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
+ if (dump)
+ fprintf (stderr, "Explicitly %s features use_gather_2parts,"
+ " use_gather_4parts, use_gather_8parts\n",
+ clear ? "clear" : "set");
+
+ }
+ else if (!strcmp (curr_feature_string, "use_scatter"))
+ {
+ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
+ if (dump)
+ fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
+ " use_scatter_4parts, use_scatter_8parts\n",
+ clear ? "clear" : "set");
+ }
+ else
+ {
+ for (i = 0; i < X86_TUNE_LAST; i++)
+ {
+ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+ {
+ ix86_tune_features[i] = !clear;
+ if (dump)
+ fprintf (stderr, "Explicitly %s feature %s\n",
+ clear ? "clear" : "set", ix86_tune_feature_names[i]);
+ break;
+ }
+ }
+
+ if (i == X86_TUNE_LAST)
+ error ("unknown parameter to option %<-mtune-ctrl%>: %s",
+ clear ? curr_feature_string - 1 : curr_feature_string);
+ }
curr_feature_string = next_feature_string;
}
while (curr_feature_string);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 479fc6010..e75d37023 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
? !TARGET_USE_SCATTER_2PARTS
: (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
? !TARGET_USE_SCATTER_4PARTS
- : !TARGET_USE_SCATTER))
+ : !TARGET_USE_SCATTER_8PARTS))
return NULL_TREE;
if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 688aaabd3..aaa136ba0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
#define TARGET_USE_SCATTER_4PARTS \
ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
-#define TARGET_USE_GATHER \
- ix86_tune_features[X86_TUNE_USE_GATHER]
-#define TARGET_USE_SCATTER \
- ix86_tune_features[X86_TUNE_USE_SCATTER]
+#define TARGET_USE_GATHER_8PARTS \
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS]
+#define TARGET_USE_SCATTER_8PARTS \
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS]
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 498fb454d..b154110d8 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated.
munroll-only-small-loops
Target Var(ix86_unroll_only_small_loops) Init(0) Save
Enable conservative small loop unrolling.
+
+mscatter
+Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
+Enable vectorization for scatter instruction.
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4392709fc..bdb455d20 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
+DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
| m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements. */
-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
+DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
~(m_ZNVER4))
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
--
2.28.0.windows.1

View File

@ -0,0 +1,129 @@
From 764518a35e90a3e13c469275da9c3c7002fe1982 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 8 Sep 2023 09:22:43 +0800
Subject: [PATCH 13/32] Remove constraint modifier % for
fcmaddcph/fmaddcph/fcmulcph since there're not commutative.
gcc/ChangeLog:
PR target/111306
PR target/111335
* config/i386/sse.md (int_comm): New int_attr.
(fma_<complexopname>_<mode><sdc_maskz_name><round_name>):
Remove % for Complex conjugate operations since they're not
commutative.
(fma_<complexpairopname>_<mode>_pair): Ditto.
(<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto.
(cmul<conj_op><mode>3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr111306.c: New test.
(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb)
---
gcc/config/i386/sse.md | 16 ++++++++---
gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++
2 files changed, 48 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3af159896..f25dd5f2b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6318,6 +6318,14 @@
[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
(UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+(define_int_attr int_comm
+ [(UNSPEC_COMPLEX_FMA "")
+ (UNSPEC_COMPLEX_FMA_PAIR "")
+ (UNSPEC_COMPLEX_FCMA "")
+ (UNSPEC_COMPLEX_FCMA_PAIR "")
+ (UNSPEC_COMPLEX_FMUL "%")
+ (UNSPEC_COMPLEX_FCMUL "")])
+
(define_int_attr conj_op
[(UNSPEC_COMPLEX_FMA "")
(UNSPEC_COMPLEX_FCMA "_conj")
@@ -6431,7 +6439,7 @@
(define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>"
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v")
+ [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "<int_comm>v")
(match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>")
(match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")]
UNSPEC_COMPLEX_F_C_MA))]
@@ -6495,7 +6503,7 @@
(define_insn "fma_<complexpairopname>_<mode>_pair"
[(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
(unspec:VF1_AVX512VL
- [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+ [(match_operand:VF1_AVX512VL 1 "vector_operand" "<int_comm>v")
(match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
(match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
UNSPEC_COMPLEX_F_C_MA_PAIR))]
@@ -6562,7 +6570,7 @@
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(vec_merge:VF_AVX512FP16VL
(unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")
(match_operand:VF_AVX512FP16VL 3 "register_operand" "0")]
UNSPEC_COMPLEX_F_C_MA)
@@ -6586,7 +6594,7 @@
(define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
UNSPEC_COMPLEX_F_C_MUL))]
"TARGET_AVX512FP16 && <round_mode512bit_condition>"
diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c
new file mode 100644
index 000000000..541725ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111306.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+__attribute__((optimize("O2"),noipa))
+void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
+ __m512h rA = _mm512_loadu_ph(a);
+ for (int i = 0; i < n; i += 32) {
+ __m512h rB = _mm512_loadu_ph(b + i);
+ _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
+ }
+}
+
+void
+test_512 (void)
+{
+ int n = 32;
+ _Float16 a[n], b[n], c[n];
+ _Float16 exp[n];
+ for (int i = 1; i <= n; i++) {
+ a[i - 1] = i & 1 ? -i : i;
+ b[i - 1] = i;
+ }
+
+ func1(a, b, n, c);
+ for (int i = 0; i < n / 32; i += 2) {
+ if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
+ || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
+ __builtin_abort ();
+ }
+}
+
+
--
2.28.0.windows.1

View File

@ -0,0 +1,106 @@
From afd539adfe762adb57863299a11987b7e20e7987 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 5 Jul 2023 13:45:11 +0800
Subject: [PATCH 14/32] Disparage slightly for the alternative which move
DFmode between SSE_REGS and GENERAL_REGS.
For testcase
void __cond_swap(double* __x, double* __y) {
bool __r = (*__x < *__y);
auto __tmp = __r ? *__x : *__y;
*__y = __r ? *__y : *__x;
*__x = __tmp;
}
GCC-14 with -O2 and -march=x86-64 options generates the following code:
__cond_swap(double*, double*):
movsd xmm1, QWORD PTR [rdi]
movsd xmm0, QWORD PTR [rsi]
comisd xmm0, xmm1
jbe .L2
movq rax, xmm1
movapd xmm1, xmm0
movq xmm0, rax
.L2:
movsd QWORD PTR [rsi], xmm1
movsd QWORD PTR [rdi], xmm0
ret
rax is used to save and restore DFmode value. In RA both GENERAL_REGS
and SSE_REGS cost zero since we didn't disparage the
alternative in movdf_internal pattern, according to register
allocation order, GENERAL_REGS is allocated. The patch add ? for
alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal
pattern, after that we get optimal RA.
__cond_swap:
.LFB0:
.cfi_startproc
movsd (%rdi), %xmm1
movsd (%rsi), %xmm0
comisd %xmm1, %xmm0
jbe .L2
movapd %xmm1, %xmm2
movapd %xmm0, %xmm1
movapd %xmm2, %xmm0
.L2:
movsd %xmm1, (%rsi)
movsd %xmm0, (%rdi)
ret
gcc/ChangeLog:
PR target/110170
* config/i386/i386.md (movdf_internal): Disparage slightly for
2 alternatives (r,v) and (v,r) by adding constraint modifier
'?'.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110170-3.c: New test.
(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e)
---
gcc/config/i386/i386.md | 4 ++--
gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++
2 files changed, 13 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index be07be10d..71691f598 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3582,9 +3582,9 @@
;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
(define_insn "*movdf_internal"
[(set (match_operand:DF 0 "nonimmediate_operand"
- "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m")
+ "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r ,o ,r ,m")
(match_operand:DF 1 "general_operand"
- "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
+ "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (lra_in_progress || reload_completed
|| !CONST_DOUBLE_P (operands[1])
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c
new file mode 100644
index 000000000..70daa89e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
+
+void __cond_swap(double* __x, double* __y) {
+ _Bool __r = (*__x < *__y);
+ double __tmp = __r ? *__x : *__y;
+ *__y = __r ? *__y : *__x;
+ *__x = __tmp;
+}
+
--
2.28.0.windows.1

View File

@ -0,0 +1,163 @@
From 88516507757932c1e67ce99d240596935971d2d0 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 9 Nov 2023 13:20:05 +0800
Subject: [PATCH 15/32] Fix wrong code due to vec_merge + pcmp to blendvb
splitter.
gcc/ChangeLog:
PR target/112443
* config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition
from LT to GT since there's not in the pattern.
(*avx2_pcmp<mode>3_5): Ditto.
gcc/testsuite/ChangeLog:
* g++.target/i386/pr112443.C: New test.
(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd)
---
gcc/config/i386/sse.md | 4 +-
gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++
2 files changed, 110 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f25dd5f2b..23b858ab2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16358,7 +16358,7 @@
(match_dup 4))]
UNSPEC_BLENDV))]
{
- if (INTVAL (operands[5]) == 1)
+ if (INTVAL (operands[5]) == 5)
std::swap (operands[1], operands[2]);
operands[3] = gen_lowpart (<MODE>mode, operands[3]);
})
@@ -16388,7 +16388,7 @@
(match_dup 4))]
UNSPEC_BLENDV))]
{
- if (INTVAL (operands[5]) == 1)
+ if (INTVAL (operands[5]) == 5)
std::swap (operands[1], operands[2]);
})
diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C
new file mode 100644
index 000000000..ebfa9b4a7
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr112443.C
@@ -0,0 +1,108 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */
+
+#include <cstdint>
+#include <x86intrin.h>
+#include <functional>
+#include <ostream>
+
+#define AVX512BW
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+struct TensorIteratorBase{
+ char* in;
+ char* out;
+
+ void for_each(std::function<void(char*, char*, int64_t size)> loop){
+ loop(out, in, 32);
+ }
+};
+
+class Vectorized {
+protected:
+ __m256i values;
+
+ static inline __m256i invert(const __m256i& v) {
+ const auto ones = _mm256_set1_epi64x(-1);
+ return _mm256_xor_si256(ones, v);
+ }
+public:
+ operator __m256i() const {
+ return values;
+ }
+
+ static constexpr int size() {
+ return 32;
+ }
+
+ Vectorized() {}
+ Vectorized(__m256i v) : values(v) {}
+ Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
+ static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+ const Vectorized& mask) {
+ return _mm256_blendv_epi8(a, b, mask);
+ }
+ static Vectorized loadu(const void* ptr) {
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+ }
+ void store(void* ptr) const {
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+ }
+
+ Vectorized operator<(const Vectorized& other) const {
+ __m256i max = _mm256_max_epu8(values, other);
+ return invert(_mm256_cmpeq_epi8(max, values));
+ }
+ Vectorized operator-(const Vectorized& b) {
+ return _mm256_sub_epi8(values, b);
+ }
+};
+
+std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) {
+ uint8_t buf[Vectorized::size()];
+ vec.store(buf);
+ stream << "vec[";
+ for (int i = 0; i != Vectorized::size(); i++) {
+ if (i != 0)
+ stream << ", ";
+ stream << buf[i]*1;
+ }
+ stream << "]";
+ return stream;
+}
+
+void run(TensorIteratorBase iter){
+ Vectorized zero_vec(0);
+ Vectorized one_vec(1);
+
+ iter.for_each([=](char* out, char* in, int64_t size) {
+ for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) {
+ auto self_vec = Vectorized::loadu(in + i);
+ auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec);
+ auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec);
+ auto outv = left - right;
+ outv.store(out + i);
+ }
+ });
+}
+
+void
+test_256 (){
+ char in[32];
+ char out[32];
+ for(auto& x: in) x = 1;
+ run(TensorIteratorBase{in, out});
+ Vectorized::loadu (out);
+ for (int i = 0; i != 32; i++)
+ if (out[i] != 1)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,151 @@
From 204ffa7f503411ccac0161c951726274648b6374 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 7 Dec 2023 09:17:27 +0800
Subject: [PATCH 16/32] Don't assume it's AVX_U128_CLEAN after call_insn whose
abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.
If the function desn't clobber any sse registers or only clobber
128-bit part, then vzeroupper isn't issued before the function exit.
the status not CLEAN but ANY after the function.
Also for sibling_call, it's safe to issue an vzeroupper. Also there
could be missing vzeroupper since there's no mode_exit for
sibling_call_p.
gcc/ChangeLog:
PR target/112891
* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
align with ix86_avx_u128_mode_needed.
(ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
sibling_call.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr112891.c: New test.
* gcc.target/i386/pr112891-2.c: New test.
(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642)
---
gcc/config/i386/i386.cc | 22 +++++++++++++---
gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr112891.c | 29 +++++++++++++++++++++
3 files changed, 78 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e75d37023..60f3296b0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
modes wider than 256 bits. It's only safe to issue a
vzeroupper if all SSE registers are clobbered. */
const function_abi &abi = insn_callee_abi (insn);
- if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
- abi.mode_clobbers (V4DImode)))
+ /* Should be safe to issue an vzeroupper before sibling_call_p.
+ Also there not mode_exit for sibling_call, so there could be
+ missing vzeroupper for that. */
+ if (!(SIBLING_CALL_P (insn)
+ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+ abi.mode_clobbers (V4DImode))))
return AVX_U128_ANY;
return AVX_U128_CLEAN;
@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
bool avx_upper_reg_found = false;
note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+ if (avx_upper_reg_found)
+ return AVX_U128_DIRTY;
+
+ /* If the function desn't clobber any sse registers or only clobber
+ 128-bit part, Then vzeroupper isn't issued before the function exit.
+ the status not CLEAN but ANY after the function. */
+ const function_abi &abi = insn_callee_abi (insn);
+ if (!(SIBLING_CALL_P (insn)
+ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+ abi.mode_clobbers (V4DImode))))
+ return AVX_U128_ANY;
+
+ return AVX_U128_CLEAN;
}
/* Otherwise, return current mode. Remember that if insn
diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
new file mode 100644
index 000000000..164c3985d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+ a[0] = 1.0;
+ a[1] = 2.0;
+}
+
+double
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ bar (b);
+ return a[5] + b[5];
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+ double c = foo (a, b);
+ return __builtin_exp (c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
new file mode 100644
index 000000000..dbf6c6794
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+ a[0] = 1.0;
+ a[1] = 2.0;
+}
+
+void
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ bar (b);
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+ foo (a, b);
+ return __builtin_exp (b[1]);
+}
--
2.28.0.windows.1