[Sync] Sync patch from openeuler/gcc
This commit is contained in:
parent
25bccb60b0
commit
2707dd6474
220
0066-Software-mitigation-Disable-gather-generation-in-vec.patch
Normal file
220
0066-Software-mitigation-Disable-gather-generation-in-vec.patch
Normal file
@ -0,0 +1,220 @@
|
||||
From cfffbec938afdc45c31db5ec282ce21ad1ba2dc7 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Thu, 10 Aug 2023 11:41:39 +0800
|
||||
Subject: [PATCH 11/32] Software mitigation: Disable gather generation in
|
||||
vectorization for GDS affected Intel Processors.
|
||||
|
||||
For more details of GDS (Gather Data Sampling), refer to
|
||||
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html
|
||||
|
||||
After microcode update, there's performance regression. To avoid that,
|
||||
the patch disables gather generation in autovectorization but uses
|
||||
gather scalar emulation instead.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
* config/i386/i386-options.cc (m_GDS): New macro.
|
||||
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
|
||||
enable for m_GDS.
|
||||
(X86_TUNE_USE_GATHER_4PARTS): Ditto.
|
||||
(X86_TUNE_USE_GATHER): Ditto.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
|
||||
gather vectorization.
|
||||
* gcc.target/i386/avx2-gather-6.c: Ditto.
|
||||
* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
|
||||
* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
|
||||
* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
|
||||
* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
|
||||
* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
|
||||
* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
|
||||
* gcc.target/i386/pr88531-1b.c: Ditto.
|
||||
* gcc.target/i386/pr88531-1c.c: Ditto.
|
||||
|
||||
(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d)
|
||||
---
|
||||
gcc/config/i386/i386-options.cc | 5 +++++
|
||||
gcc/config/i386/x86-tune.def | 9 ++++++---
|
||||
gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx2-gather-6.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/pr88531-1b.c | 2 +-
|
||||
gcc/testsuite/gcc.target/i386/pr88531-1c.c | 2 +-
|
||||
12 files changed, 21 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
|
||||
index fb2ed942f..9617fc162 100644
|
||||
--- a/gcc/config/i386/i386-options.cc
|
||||
+++ b/gcc/config/i386/i386-options.cc
|
||||
@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3. If not see
|
||||
#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
|
||||
#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
|
||||
#define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
|
||||
+/* Gather Data Sampling / CVE-2022-40982 / INTEL-SA-00828.
|
||||
+ Software mitigation. */
|
||||
+#define m_GDS (m_SKYLAKE | m_SKYLAKE_AVX512 | m_CANNONLAKE \
|
||||
+ | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
|
||||
+ | m_TIGERLAKE | m_COOPERLAKE | m_ROCKETLAKE)
|
||||
|
||||
#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
|
||||
#define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
|
||||
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
|
||||
index e6b9e2125..4392709fc 100644
|
||||
--- a/gcc/config/i386/x86-tune.def
|
||||
+++ b/gcc/config/i386/x86-tune.def
|
||||
@@ -467,7 +467,8 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
|
||||
/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
|
||||
elements. */
|
||||
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
|
||||
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
|
||||
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
|
||||
+ | m_GENERIC | m_GDS))
|
||||
|
||||
/* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
|
||||
elements. */
|
||||
@@ -477,7 +478,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
|
||||
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
|
||||
elements. */
|
||||
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
|
||||
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
|
||||
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
|
||||
+ | m_GENERIC | m_GDS))
|
||||
|
||||
/* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
|
||||
elements. */
|
||||
@@ -487,7 +489,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
|
||||
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
|
||||
elements. */
|
||||
DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
|
||||
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
|
||||
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
|
||||
+ | m_GENERIC | m_GDS))
|
||||
|
||||
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
|
||||
elements. */
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
|
||||
index ad5ef7310..978924b0f 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
|
||||
@@ -1,5 +1,5 @@
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */
|
||||
+/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake -mtune=haswell" } */
|
||||
|
||||
#include "avx2-gather-1.c"
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
|
||||
index b9119581a..067b251e3 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
|
||||
@@ -1,5 +1,5 @@
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=skylake" } */
|
||||
+/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=haswell" } */
|
||||
|
||||
#include "avx2-gather-5.c"
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
|
||||
index 06d21bb01..d1a229861 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
|
||||
@@ -1,6 +1,6 @@
|
||||
/* PR tree-optimization/88464 */
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
|
||||
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
|
||||
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
|
||||
index 462e951fd..d7b0b2b28 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
|
||||
@@ -1,6 +1,6 @@
|
||||
/* PR tree-optimization/88464 */
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
|
||||
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
|
||||
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
|
||||
index 55a28dddb..07439185e 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
|
||||
@@ -1,6 +1,6 @@
|
||||
/* PR tree-optimization/88464 */
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
|
||||
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
|
||||
/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
|
||||
index 969600885..3a9810827 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
|
||||
@@ -1,6 +1,6 @@
|
||||
/* PR tree-optimization/88464 */
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
|
||||
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
|
||||
/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
|
||||
index 6b0c8a859..ac669e048 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
|
||||
@@ -1,6 +1,6 @@
|
||||
/* PR tree-optimization/88464 */
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
|
||||
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
|
||||
/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
|
||||
index 3af568ab3..14a1083b6 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
|
||||
@@ -1,6 +1,6 @@
|
||||
/* PR tree-optimization/88464 */
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
|
||||
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
|
||||
/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1b.c b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
|
||||
index 812c8a10f..e6df789de 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/pr88531-1b.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
|
||||
@@ -1,5 +1,5 @@
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -march=skylake -mfpmath=sse" } */
|
||||
+/* { dg-options "-O3 -march=skylake -mfpmath=sse -mtune=haswell" } */
|
||||
|
||||
#include "pr88531-1a.c"
|
||||
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1c.c b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
|
||||
index 43fc5913e..a093c87c0 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/pr88531-1c.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
|
||||
@@ -1,5 +1,5 @@
|
||||
/* { dg-do compile } */
|
||||
-/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse" } */
|
||||
+/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse -mtune=haswell" } */
|
||||
|
||||
#include "pr88531-1a.c"
|
||||
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
187
0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
Normal file
187
0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
Normal file
@ -0,0 +1,187 @@
|
||||
From c269629130cb23252da2db026ce9ed13f57f69f4 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Thu, 10 Aug 2023 16:26:13 +0800
|
||||
Subject: [PATCH 12/32] Support -m[no-]gather -m[no-]scatter to enable/disable
|
||||
vectorization for all gather/scatter instructions
|
||||
|
||||
Rename original use_gather to use_gather_8parts, Support
|
||||
-mtune-ctrl={,^}use_gather to set/clear tune features
|
||||
use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather
|
||||
as alias of -mtune-ctrl=, use_gather, ^use_gather.
|
||||
|
||||
Similar for use_scatter.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
* config/i386/i386-builtins.cc
|
||||
(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
|
||||
* config/i386/i386-options.cc (parse_mtune_ctrl_str):
|
||||
Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
|
||||
8parts} for -mtune-crtl={,^}{use_gather,use_scatter}.
|
||||
* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
|
||||
for use_scatter_8parts
|
||||
* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
|
||||
(TARGET_USE_GATHER_8PARTS): .. this.
|
||||
(TARGET_USE_SCATTER): Rename to ..
|
||||
(TARGET_USE_SCATTER_8PARTS): .. this.
|
||||
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
|
||||
(X86_TUNE_USE_GATHER_8PARTS): .. this.
|
||||
(X86_TUNE_USE_SCATTER): Rename to
|
||||
(X86_TUNE_USE_SCATTER_8PARTS): .. this.
|
||||
* config/i386/i386.opt: Add new options mgather, mscatter.
|
||||
|
||||
(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737)
|
||||
---
|
||||
gcc/config/i386/i386-builtins.cc | 2 +-
|
||||
gcc/config/i386/i386-options.cc | 54 +++++++++++++++++++++++---------
|
||||
gcc/config/i386/i386.cc | 2 +-
|
||||
gcc/config/i386/i386.h | 8 ++---
|
||||
gcc/config/i386/i386.opt | 4 +++
|
||||
gcc/config/i386/x86-tune.def | 4 +--
|
||||
6 files changed, 52 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
|
||||
index 050c6228a..8ed32e14f 100644
|
||||
--- a/gcc/config/i386/i386-builtins.cc
|
||||
+++ b/gcc/config/i386/i386-builtins.cc
|
||||
@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
|
||||
? !TARGET_USE_GATHER_2PARTS
|
||||
: (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
|
||||
? !TARGET_USE_GATHER_4PARTS
|
||||
- : !TARGET_USE_GATHER)))
|
||||
+ : !TARGET_USE_GATHER_8PARTS)))
|
||||
return NULL_TREE;
|
||||
|
||||
if ((TREE_CODE (index_type) != INTEGER_TYPE
|
||||
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
|
||||
index 9617fc162..3df1f0c41 100644
|
||||
--- a/gcc/config/i386/i386-options.cc
|
||||
+++ b/gcc/config/i386/i386-options.cc
|
||||
@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump)
|
||||
curr_feature_string++;
|
||||
clear = true;
|
||||
}
|
||||
- for (i = 0; i < X86_TUNE_LAST; i++)
|
||||
- {
|
||||
- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
|
||||
- {
|
||||
- ix86_tune_features[i] = !clear;
|
||||
- if (dump)
|
||||
- fprintf (stderr, "Explicitly %s feature %s\n",
|
||||
- clear ? "clear" : "set", ix86_tune_feature_names[i]);
|
||||
- break;
|
||||
- }
|
||||
- }
|
||||
- if (i == X86_TUNE_LAST)
|
||||
- error ("unknown parameter to option %<-mtune-ctrl%>: %s",
|
||||
- clear ? curr_feature_string - 1 : curr_feature_string);
|
||||
+
|
||||
+ if (!strcmp (curr_feature_string, "use_gather"))
|
||||
+ {
|
||||
+ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
|
||||
+ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
|
||||
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
|
||||
+ if (dump)
|
||||
+ fprintf (stderr, "Explicitly %s features use_gather_2parts,"
|
||||
+ " use_gather_4parts, use_gather_8parts\n",
|
||||
+ clear ? "clear" : "set");
|
||||
+
|
||||
+ }
|
||||
+ else if (!strcmp (curr_feature_string, "use_scatter"))
|
||||
+ {
|
||||
+ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
|
||||
+ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
|
||||
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
|
||||
+ if (dump)
|
||||
+ fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
|
||||
+ " use_scatter_4parts, use_scatter_8parts\n",
|
||||
+ clear ? "clear" : "set");
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ for (i = 0; i < X86_TUNE_LAST; i++)
|
||||
+ {
|
||||
+ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
|
||||
+ {
|
||||
+ ix86_tune_features[i] = !clear;
|
||||
+ if (dump)
|
||||
+ fprintf (stderr, "Explicitly %s feature %s\n",
|
||||
+ clear ? "clear" : "set", ix86_tune_feature_names[i]);
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (i == X86_TUNE_LAST)
|
||||
+ error ("unknown parameter to option %<-mtune-ctrl%>: %s",
|
||||
+ clear ? curr_feature_string - 1 : curr_feature_string);
|
||||
+ }
|
||||
curr_feature_string = next_feature_string;
|
||||
}
|
||||
while (curr_feature_string);
|
||||
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
|
||||
index 479fc6010..e75d37023 100644
|
||||
--- a/gcc/config/i386/i386.cc
|
||||
+++ b/gcc/config/i386/i386.cc
|
||||
@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
|
||||
? !TARGET_USE_SCATTER_2PARTS
|
||||
: (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
|
||||
? !TARGET_USE_SCATTER_4PARTS
|
||||
- : !TARGET_USE_SCATTER))
|
||||
+ : !TARGET_USE_SCATTER_8PARTS))
|
||||
return NULL_TREE;
|
||||
|
||||
if ((TREE_CODE (index_type) != INTEGER_TYPE
|
||||
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
|
||||
index 688aaabd3..aaa136ba0 100644
|
||||
--- a/gcc/config/i386/i386.h
|
||||
+++ b/gcc/config/i386/i386.h
|
||||
@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
|
||||
ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
|
||||
#define TARGET_USE_SCATTER_4PARTS \
|
||||
ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
|
||||
-#define TARGET_USE_GATHER \
|
||||
- ix86_tune_features[X86_TUNE_USE_GATHER]
|
||||
-#define TARGET_USE_SCATTER \
|
||||
- ix86_tune_features[X86_TUNE_USE_SCATTER]
|
||||
+#define TARGET_USE_GATHER_8PARTS \
|
||||
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS]
|
||||
+#define TARGET_USE_SCATTER_8PARTS \
|
||||
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS]
|
||||
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
|
||||
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
|
||||
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
|
||||
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
|
||||
index 498fb454d..b154110d8 100644
|
||||
--- a/gcc/config/i386/i386.opt
|
||||
+++ b/gcc/config/i386/i386.opt
|
||||
@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated.
|
||||
munroll-only-small-loops
|
||||
Target Var(ix86_unroll_only_small_loops) Init(0) Save
|
||||
Enable conservative small loop unrolling.
|
||||
+
|
||||
+mscatter
|
||||
+Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
|
||||
+Enable vectorization for scatter instruction.
|
||||
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
|
||||
index 4392709fc..bdb455d20 100644
|
||||
--- a/gcc/config/i386/x86-tune.def
|
||||
+++ b/gcc/config/i386/x86-tune.def
|
||||
@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
|
||||
|
||||
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
|
||||
elements. */
|
||||
-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
|
||||
+DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
|
||||
~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
|
||||
| m_GENERIC | m_GDS))
|
||||
|
||||
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
|
||||
elements. */
|
||||
-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
|
||||
+DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
|
||||
~(m_ZNVER4))
|
||||
|
||||
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
129
0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
Normal file
129
0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
Normal file
@ -0,0 +1,129 @@
|
||||
From 764518a35e90a3e13c469275da9c3c7002fe1982 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Fri, 8 Sep 2023 09:22:43 +0800
|
||||
Subject: [PATCH 13/32] Remove constraint modifier % for
|
||||
fcmaddcph/fmaddcph/fcmulcph since there're not commutative.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR target/111306
|
||||
PR target/111335
|
||||
* config/i386/sse.md (int_comm): New int_attr.
|
||||
(fma_<complexopname>_<mode><sdc_maskz_name><round_name>):
|
||||
Remove % for Complex conjugate operations since they're not
|
||||
commutative.
|
||||
(fma_<complexpairopname>_<mode>_pair): Ditto.
|
||||
(<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto.
|
||||
(cmul<conj_op><mode>3): Ditto.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/pr111306.c: New test.
|
||||
|
||||
(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb)
|
||||
---
|
||||
gcc/config/i386/sse.md | 16 ++++++++---
|
||||
gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++
|
||||
2 files changed, 48 insertions(+), 4 deletions(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c
|
||||
|
||||
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
|
||||
index 3af159896..f25dd5f2b 100644
|
||||
--- a/gcc/config/i386/sse.md
|
||||
+++ b/gcc/config/i386/sse.md
|
||||
@@ -6318,6 +6318,14 @@
|
||||
[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
|
||||
(UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
|
||||
|
||||
+(define_int_attr int_comm
|
||||
+ [(UNSPEC_COMPLEX_FMA "")
|
||||
+ (UNSPEC_COMPLEX_FMA_PAIR "")
|
||||
+ (UNSPEC_COMPLEX_FCMA "")
|
||||
+ (UNSPEC_COMPLEX_FCMA_PAIR "")
|
||||
+ (UNSPEC_COMPLEX_FMUL "%")
|
||||
+ (UNSPEC_COMPLEX_FCMUL "")])
|
||||
+
|
||||
(define_int_attr conj_op
|
||||
[(UNSPEC_COMPLEX_FMA "")
|
||||
(UNSPEC_COMPLEX_FCMA "_conj")
|
||||
@@ -6431,7 +6439,7 @@
|
||||
(define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>"
|
||||
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
|
||||
(unspec:VF_AVX512FP16VL
|
||||
- [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v")
|
||||
+ [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "<int_comm>v")
|
||||
(match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>")
|
||||
(match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")]
|
||||
UNSPEC_COMPLEX_F_C_MA))]
|
||||
@@ -6495,7 +6503,7 @@
|
||||
(define_insn "fma_<complexpairopname>_<mode>_pair"
|
||||
[(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
|
||||
(unspec:VF1_AVX512VL
|
||||
- [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
|
||||
+ [(match_operand:VF1_AVX512VL 1 "vector_operand" "<int_comm>v")
|
||||
(match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
|
||||
(match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
|
||||
UNSPEC_COMPLEX_F_C_MA_PAIR))]
|
||||
@@ -6562,7 +6570,7 @@
|
||||
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
|
||||
(vec_merge:VF_AVX512FP16VL
|
||||
(unspec:VF_AVX512FP16VL
|
||||
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
|
||||
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
|
||||
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")
|
||||
(match_operand:VF_AVX512FP16VL 3 "register_operand" "0")]
|
||||
UNSPEC_COMPLEX_F_C_MA)
|
||||
@@ -6586,7 +6594,7 @@
|
||||
(define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
|
||||
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
|
||||
(unspec:VF_AVX512FP16VL
|
||||
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
|
||||
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
|
||||
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
|
||||
UNSPEC_COMPLEX_F_C_MUL))]
|
||||
"TARGET_AVX512FP16 && <round_mode512bit_condition>"
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c
|
||||
new file mode 100644
|
||||
index 000000000..541725ebd
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr111306.c
|
||||
@@ -0,0 +1,36 @@
|
||||
+/* { dg-do run } */
|
||||
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
|
||||
+/* { dg-require-effective-target avx512fp16 } */
|
||||
+
|
||||
+#define AVX512FP16
|
||||
+#include "avx512f-helper.h"
|
||||
+
|
||||
+__attribute__((optimize("O2"),noipa))
|
||||
+void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
|
||||
+ __m512h rA = _mm512_loadu_ph(a);
|
||||
+ for (int i = 0; i < n; i += 32) {
|
||||
+ __m512h rB = _mm512_loadu_ph(b + i);
|
||||
+ _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+void
|
||||
+test_512 (void)
|
||||
+{
|
||||
+ int n = 32;
|
||||
+ _Float16 a[n], b[n], c[n];
|
||||
+ _Float16 exp[n];
|
||||
+ for (int i = 1; i <= n; i++) {
|
||||
+ a[i - 1] = i & 1 ? -i : i;
|
||||
+ b[i - 1] = i;
|
||||
+ }
|
||||
+
|
||||
+ func1(a, b, n, c);
|
||||
+ for (int i = 0; i < n / 32; i += 2) {
|
||||
+ if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
|
||||
+ || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
|
||||
+ __builtin_abort ();
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
106
0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
Normal file
106
0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
Normal file
@ -0,0 +1,106 @@
|
||||
From afd539adfe762adb57863299a11987b7e20e7987 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Wed, 5 Jul 2023 13:45:11 +0800
|
||||
Subject: [PATCH 14/32] Disparage slightly for the alternative which move
|
||||
DFmode between SSE_REGS and GENERAL_REGS.
|
||||
|
||||
For testcase
|
||||
|
||||
void __cond_swap(double* __x, double* __y) {
|
||||
bool __r = (*__x < *__y);
|
||||
auto __tmp = __r ? *__x : *__y;
|
||||
*__y = __r ? *__y : *__x;
|
||||
*__x = __tmp;
|
||||
}
|
||||
|
||||
GCC-14 with -O2 and -march=x86-64 options generates the following code:
|
||||
|
||||
__cond_swap(double*, double*):
|
||||
movsd xmm1, QWORD PTR [rdi]
|
||||
movsd xmm0, QWORD PTR [rsi]
|
||||
comisd xmm0, xmm1
|
||||
jbe .L2
|
||||
movq rax, xmm1
|
||||
movapd xmm1, xmm0
|
||||
movq xmm0, rax
|
||||
.L2:
|
||||
movsd QWORD PTR [rsi], xmm1
|
||||
movsd QWORD PTR [rdi], xmm0
|
||||
ret
|
||||
|
||||
rax is used to save and restore DFmode value. In RA both GENERAL_REGS
|
||||
and SSE_REGS cost zero since we didn't disparage the
|
||||
alternative in movdf_internal pattern, according to register
|
||||
allocation order, GENERAL_REGS is allocated. The patch add ? for
|
||||
alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal
|
||||
pattern, after that we get optimal RA.
|
||||
|
||||
__cond_swap:
|
||||
.LFB0:
|
||||
.cfi_startproc
|
||||
movsd (%rdi), %xmm1
|
||||
movsd (%rsi), %xmm0
|
||||
comisd %xmm1, %xmm0
|
||||
jbe .L2
|
||||
movapd %xmm1, %xmm2
|
||||
movapd %xmm0, %xmm1
|
||||
movapd %xmm2, %xmm0
|
||||
.L2:
|
||||
movsd %xmm1, (%rsi)
|
||||
movsd %xmm0, (%rdi)
|
||||
ret
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR target/110170
|
||||
* config/i386/i386.md (movdf_internal): Disparage slightly for
|
||||
2 alternatives (r,v) and (v,r) by adding constraint modifier
|
||||
'?'.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/pr110170-3.c: New test.
|
||||
|
||||
(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e)
|
||||
---
|
||||
gcc/config/i386/i386.md | 4 ++--
|
||||
gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++
|
||||
2 files changed, 13 insertions(+), 2 deletions(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c
|
||||
|
||||
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
|
||||
index be07be10d..71691f598 100644
|
||||
--- a/gcc/config/i386/i386.md
|
||||
+++ b/gcc/config/i386/i386.md
|
||||
@@ -3582,9 +3582,9 @@
|
||||
;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
|
||||
(define_insn "*movdf_internal"
|
||||
[(set (match_operand:DF 0 "nonimmediate_operand"
|
||||
- "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m")
|
||||
+ "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r ,o ,r ,m")
|
||||
(match_operand:DF 1 "general_operand"
|
||||
- "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
|
||||
+ "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))]
|
||||
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
|
||||
&& (lra_in_progress || reload_completed
|
||||
|| !CONST_DOUBLE_P (operands[1])
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c
|
||||
new file mode 100644
|
||||
index 000000000..70daa89e9
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
|
||||
@@ -0,0 +1,11 @@
|
||||
+/* { dg-do compile { target { ! ia32 } } } */
|
||||
+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
|
||||
+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
|
||||
+
|
||||
+void __cond_swap(double* __x, double* __y) {
|
||||
+ _Bool __r = (*__x < *__y);
|
||||
+ double __tmp = __r ? *__x : *__y;
|
||||
+ *__y = __r ? *__y : *__x;
|
||||
+ *__x = __tmp;
|
||||
+}
|
||||
+
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
163
0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
Normal file
163
0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
Normal file
@ -0,0 +1,163 @@
|
||||
From 88516507757932c1e67ce99d240596935971d2d0 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Thu, 9 Nov 2023 13:20:05 +0800
|
||||
Subject: [PATCH 15/32] Fix wrong code due to vec_merge + pcmp to blendvb
|
||||
splitter.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR target/112443
|
||||
* config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition
|
||||
from LT to GT since there's not in the pattern.
|
||||
(*avx2_pcmp<mode>3_5): Ditto.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* g++.target/i386/pr112443.C: New test.
|
||||
|
||||
(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd)
|
||||
---
|
||||
gcc/config/i386/sse.md | 4 +-
|
||||
gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++
|
||||
2 files changed, 110 insertions(+), 2 deletions(-)
|
||||
create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C
|
||||
|
||||
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
|
||||
index f25dd5f2b..23b858ab2 100644
|
||||
--- a/gcc/config/i386/sse.md
|
||||
+++ b/gcc/config/i386/sse.md
|
||||
@@ -16358,7 +16358,7 @@
|
||||
(match_dup 4))]
|
||||
UNSPEC_BLENDV))]
|
||||
{
|
||||
- if (INTVAL (operands[5]) == 1)
|
||||
+ if (INTVAL (operands[5]) == 5)
|
||||
std::swap (operands[1], operands[2]);
|
||||
operands[3] = gen_lowpart (<MODE>mode, operands[3]);
|
||||
})
|
||||
@@ -16388,7 +16388,7 @@
|
||||
(match_dup 4))]
|
||||
UNSPEC_BLENDV))]
|
||||
{
|
||||
- if (INTVAL (operands[5]) == 1)
|
||||
+ if (INTVAL (operands[5]) == 5)
|
||||
std::swap (operands[1], operands[2]);
|
||||
})
|
||||
|
||||
diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C
|
||||
new file mode 100644
|
||||
index 000000000..ebfa9b4a7
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/g++.target/i386/pr112443.C
|
||||
@@ -0,0 +1,108 @@
|
||||
+/* { dg-do run } */
|
||||
+/* { dg-require-effective-target avx512bw } */
|
||||
+/* { dg-require-effective-target avx512vl } */
|
||||
+/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */
|
||||
+
|
||||
+#include <cstdint>
|
||||
+#include <x86intrin.h>
|
||||
+#include <functional>
|
||||
+#include <ostream>
|
||||
+
|
||||
+#define AVX512BW
|
||||
+#define AVX512VL
|
||||
+
|
||||
+#include "avx512f-helper.h"
|
||||
+
|
||||
+struct TensorIteratorBase{
|
||||
+ char* in;
|
||||
+ char* out;
|
||||
+
|
||||
+ void for_each(std::function<void(char*, char*, int64_t size)> loop){
|
||||
+ loop(out, in, 32);
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
+class Vectorized {
|
||||
+protected:
|
||||
+ __m256i values;
|
||||
+
|
||||
+ static inline __m256i invert(const __m256i& v) {
|
||||
+ const auto ones = _mm256_set1_epi64x(-1);
|
||||
+ return _mm256_xor_si256(ones, v);
|
||||
+ }
|
||||
+public:
|
||||
+ operator __m256i() const {
|
||||
+ return values;
|
||||
+ }
|
||||
+
|
||||
+ static constexpr int size() {
|
||||
+ return 32;
|
||||
+ }
|
||||
+
|
||||
+ Vectorized() {}
|
||||
+ Vectorized(__m256i v) : values(v) {}
|
||||
+ Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
|
||||
+ static Vectorized blendv(const Vectorized& a, const Vectorized& b,
|
||||
+ const Vectorized& mask) {
|
||||
+ return _mm256_blendv_epi8(a, b, mask);
|
||||
+ }
|
||||
+ static Vectorized loadu(const void* ptr) {
|
||||
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
+ }
|
||||
+ void store(void* ptr) const {
|
||||
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
+ }
|
||||
+
|
||||
+ Vectorized operator<(const Vectorized& other) const {
|
||||
+ __m256i max = _mm256_max_epu8(values, other);
|
||||
+ return invert(_mm256_cmpeq_epi8(max, values));
|
||||
+ }
|
||||
+ Vectorized operator-(const Vectorized& b) {
|
||||
+ return _mm256_sub_epi8(values, b);
|
||||
+ }
|
||||
+};
|
||||
+
|
||||
+std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) {
|
||||
+ uint8_t buf[Vectorized::size()];
|
||||
+ vec.store(buf);
|
||||
+ stream << "vec[";
|
||||
+ for (int i = 0; i != Vectorized::size(); i++) {
|
||||
+ if (i != 0)
|
||||
+ stream << ", ";
|
||||
+ stream << buf[i]*1;
|
||||
+ }
|
||||
+ stream << "]";
|
||||
+ return stream;
|
||||
+}
|
||||
+
|
||||
+void run(TensorIteratorBase iter){
|
||||
+ Vectorized zero_vec(0);
|
||||
+ Vectorized one_vec(1);
|
||||
+
|
||||
+ iter.for_each([=](char* out, char* in, int64_t size) {
|
||||
+ for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) {
|
||||
+ auto self_vec = Vectorized::loadu(in + i);
|
||||
+ auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec);
|
||||
+ auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec);
|
||||
+ auto outv = left - right;
|
||||
+ outv.store(out + i);
|
||||
+ }
|
||||
+ });
|
||||
+}
|
||||
+
|
||||
+void
|
||||
+test_256 (){
|
||||
+ char in[32];
|
||||
+ char out[32];
|
||||
+ for(auto& x: in) x = 1;
|
||||
+ run(TensorIteratorBase{in, out});
|
||||
+ Vectorized::loadu (out);
|
||||
+ for (int i = 0; i != 32; i++)
|
||||
+ if (out[i] != 1)
|
||||
+ __builtin_abort ();
|
||||
+}
|
||||
+
|
||||
+void
|
||||
+test_128 ()
|
||||
+{
|
||||
+}
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
151
0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
Normal file
151
0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
Normal file
@ -0,0 +1,151 @@
|
||||
From 204ffa7f503411ccac0161c951726274648b6374 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Thu, 7 Dec 2023 09:17:27 +0800
|
||||
Subject: [PATCH 16/32] Don't assume it's AVX_U128_CLEAN after call_insn whose
|
||||
abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.
|
||||
|
||||
If the function desn't clobber any sse registers or only clobber
|
||||
128-bit part, then vzeroupper isn't issued before the function exit.
|
||||
the status not CLEAN but ANY after the function.
|
||||
|
||||
Also for sibling_call, it's safe to issue an vzeroupper. Also there
|
||||
could be missing vzeroupper since there's no mode_exit for
|
||||
sibling_call_p.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR target/112891
|
||||
* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
|
||||
AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
|
||||
align with ix86_avx_u128_mode_needed.
|
||||
(ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
|
||||
sibling_call.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/pr112891.c: New test.
|
||||
* gcc.target/i386/pr112891-2.c: New test.
|
||||
|
||||
(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642)
|
||||
---
|
||||
gcc/config/i386/i386.cc | 22 +++++++++++++---
|
||||
gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
|
||||
gcc/testsuite/gcc.target/i386/pr112891.c | 29 +++++++++++++++++++++
|
||||
3 files changed, 78 insertions(+), 3 deletions(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
|
||||
|
||||
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
|
||||
index e75d37023..60f3296b0 100644
|
||||
--- a/gcc/config/i386/i386.cc
|
||||
+++ b/gcc/config/i386/i386.cc
|
||||
@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
|
||||
modes wider than 256 bits. It's only safe to issue a
|
||||
vzeroupper if all SSE registers are clobbered. */
|
||||
const function_abi &abi = insn_callee_abi (insn);
|
||||
- if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
|
||||
- abi.mode_clobbers (V4DImode)))
|
||||
+ /* Should be safe to issue an vzeroupper before sibling_call_p.
|
||||
+ Also there not mode_exit for sibling_call, so there could be
|
||||
+ missing vzeroupper for that. */
|
||||
+ if (!(SIBLING_CALL_P (insn)
|
||||
+ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
|
||||
+ abi.mode_clobbers (V4DImode))))
|
||||
return AVX_U128_ANY;
|
||||
|
||||
return AVX_U128_CLEAN;
|
||||
@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
|
||||
bool avx_upper_reg_found = false;
|
||||
note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
|
||||
|
||||
- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
|
||||
+ if (avx_upper_reg_found)
|
||||
+ return AVX_U128_DIRTY;
|
||||
+
|
||||
+ /* If the function desn't clobber any sse registers or only clobber
|
||||
+ 128-bit part, Then vzeroupper isn't issued before the function exit.
|
||||
+ the status not CLEAN but ANY after the function. */
|
||||
+ const function_abi &abi = insn_callee_abi (insn);
|
||||
+ if (!(SIBLING_CALL_P (insn)
|
||||
+ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
|
||||
+ abi.mode_clobbers (V4DImode))))
|
||||
+ return AVX_U128_ANY;
|
||||
+
|
||||
+ return AVX_U128_CLEAN;
|
||||
}
|
||||
|
||||
/* Otherwise, return current mode. Remember that if insn
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
|
||||
new file mode 100644
|
||||
index 000000000..164c3985d
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
|
||||
@@ -0,0 +1,30 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* { dg-options "-mavx2 -O3" } */
|
||||
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
|
||||
+
|
||||
+void
|
||||
+__attribute__((noinline))
|
||||
+bar (double* a)
|
||||
+{
|
||||
+ a[0] = 1.0;
|
||||
+ a[1] = 2.0;
|
||||
+}
|
||||
+
|
||||
+double
|
||||
+__attribute__((noinline))
|
||||
+foo (double* __restrict a, double* b)
|
||||
+{
|
||||
+ a[0] += b[0];
|
||||
+ a[1] += b[1];
|
||||
+ a[2] += b[2];
|
||||
+ a[3] += b[3];
|
||||
+ bar (b);
|
||||
+ return a[5] + b[5];
|
||||
+}
|
||||
+
|
||||
+double
|
||||
+foo1 (double* __restrict a, double* b)
|
||||
+{
|
||||
+ double c = foo (a, b);
|
||||
+ return __builtin_exp (c);
|
||||
+}
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
|
||||
new file mode 100644
|
||||
index 000000000..dbf6c6794
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr112891.c
|
||||
@@ -0,0 +1,29 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* { dg-options "-mavx2 -O3" } */
|
||||
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
|
||||
+
|
||||
+void
|
||||
+__attribute__((noinline))
|
||||
+bar (double* a)
|
||||
+{
|
||||
+ a[0] = 1.0;
|
||||
+ a[1] = 2.0;
|
||||
+}
|
||||
+
|
||||
+void
|
||||
+__attribute__((noinline))
|
||||
+foo (double* __restrict a, double* b)
|
||||
+{
|
||||
+ a[0] += b[0];
|
||||
+ a[1] += b[1];
|
||||
+ a[2] += b[2];
|
||||
+ a[3] += b[3];
|
||||
+ bar (b);
|
||||
+}
|
||||
+
|
||||
+double
|
||||
+foo1 (double* __restrict a, double* b)
|
||||
+{
|
||||
+ foo (a, b);
|
||||
+ return __builtin_exp (b[1]);
|
||||
+}
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user