[Sync] Sync patch from openeuler/gcc
This commit is contained in:
parent
e487f0b8ff
commit
25bccb60b0
135
0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch
Normal file
135
0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch
Normal file
@ -0,0 +1,135 @@
|
||||
From 1649f9fbbc5267de2a675336d3ac665528a03db8 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Wed, 10 May 2023 15:16:58 +0800
|
||||
Subject: [PATCH 03/32] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ
|
||||
flags in MXCSR.
|
||||
|
||||
if (mdaz-ftz)
|
||||
link crtfastmath.o
|
||||
else if ((Ofast || ffast-math || funsafe-math-optimizations)
|
||||
&& !mno-daz-ftz)
|
||||
link crtfastmath.o
|
||||
else
|
||||
Don't link crtfastmath.o
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
|
||||
whenever -mdaz-ftz is specified. Don't link crtfastmath.o
|
||||
when -mno-daz-ftz is specified.
|
||||
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
|
||||
* config/i386/gnu-user-common.h
|
||||
(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
|
||||
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
|
||||
* config/i386/i386.opt (mdaz-ftz): New option.
|
||||
* doc/invoke.texi (x86 options): Document mftz-daz.
|
||||
---
|
||||
gcc/config/i386/cygwin.h | 2 +-
|
||||
gcc/config/i386/darwin.h | 4 ++--
|
||||
gcc/config/i386/gnu-user-common.h | 2 +-
|
||||
gcc/config/i386/i386.opt | 4 ++++
|
||||
gcc/config/i386/mingw32.h | 2 +-
|
||||
gcc/doc/invoke.texi | 11 ++++++++++-
|
||||
6 files changed, 19 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
|
||||
index d06eda369..5412c5d44 100644
|
||||
--- a/gcc/config/i386/cygwin.h
|
||||
+++ b/gcc/config/i386/cygwin.h
|
||||
@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
|
||||
#undef ENDFILE_SPEC
|
||||
#define ENDFILE_SPEC \
|
||||
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
|
||||
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
|
||||
%{!shared:%:if-exists(default-manifest.o%s)}\
|
||||
%{fvtable-verify=none:%s; \
|
||||
fvtable-verify=preinit:vtv_end.o%s; \
|
||||
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
|
||||
index a55f6b2b8..2f773924d 100644
|
||||
--- a/gcc/config/i386/darwin.h
|
||||
+++ b/gcc/config/i386/darwin.h
|
||||
@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. If not see
|
||||
"%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} "
|
||||
|
||||
#undef ENDFILE_SPEC
|
||||
-#define ENDFILE_SPEC \
|
||||
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
|
||||
+#define ENDFILE_SPEC
|
||||
+\ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
|
||||
%{mpc32:crtprec32.o%s} \
|
||||
%{mpc64:crtprec64.o%s} \
|
||||
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
|
||||
diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h
|
||||
index 23b54c5be..3d2a33f17 100644
|
||||
--- a/gcc/config/i386/gnu-user-common.h
|
||||
+++ b/gcc/config/i386/gnu-user-common.h
|
||||
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
|
||||
/* Similar to standard GNU userspace, but adding -ffast-math support. */
|
||||
#define GNU_USER_TARGET_MATHFILE_SPEC \
|
||||
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
|
||||
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
|
||||
%{mpc32:crtprec32.o%s} \
|
||||
%{mpc64:crtprec64.o%s} \
|
||||
%{mpc80:crtprec80.o%s}"
|
||||
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
|
||||
index fc1b944ac..498fb454d 100644
|
||||
--- a/gcc/config/i386/i386.opt
|
||||
+++ b/gcc/config/i386/i386.opt
|
||||
@@ -420,6 +420,10 @@ mpc80
|
||||
Target RejectNegative
|
||||
Set 80387 floating-point precision to 80-bit.
|
||||
|
||||
+mdaz-ftz
|
||||
+Target
|
||||
+Set the FTZ and DAZ Flags.
|
||||
+
|
||||
mpreferred-stack-boundary=
|
||||
Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
|
||||
Attempt to keep stack aligned to this power of 2.
|
||||
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
|
||||
index d3ca0cd02..ddbe6a405 100644
|
||||
--- a/gcc/config/i386/mingw32.h
|
||||
+++ b/gcc/config/i386/mingw32.h
|
||||
@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
|
||||
#undef ENDFILE_SPEC
|
||||
#define ENDFILE_SPEC \
|
||||
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
|
||||
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
|
||||
%{!shared:%:if-exists(default-manifest.o%s)}\
|
||||
%{fvtable-verify=none:%s; \
|
||||
fvtable-verify=preinit:vtv_end.o%s; \
|
||||
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
|
||||
index 2b376e0e9..3a48655e5 100644
|
||||
--- a/gcc/doc/invoke.texi
|
||||
+++ b/gcc/doc/invoke.texi
|
||||
@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options.
|
||||
-m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol
|
||||
-mregparm=@var{num} -msseregparm @gol
|
||||
-mveclibabi=@var{type} -mvect8-ret-in-mem @gol
|
||||
--mpc32 -mpc64 -mpc80 -mstackrealign @gol
|
||||
+-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol
|
||||
-momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
|
||||
-mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol
|
||||
-m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol
|
||||
@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant
|
||||
loss of accuracy, typically through so-called ``catastrophic cancellation'',
|
||||
when this option is used to set the precision to less than extended precision.
|
||||
|
||||
+@item -mdaz-ftz
|
||||
+@opindex mdaz-ftz
|
||||
+
|
||||
+The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register
|
||||
+are used to control floating-point calculations.SSE and AVX instructions
|
||||
+including scalar and vector instructions could benefit from enabling the FTZ
|
||||
+and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags
|
||||
+when @option{-mno-daz-ftz} is specified.
|
||||
+
|
||||
@item -mstackrealign
|
||||
@opindex mstackrealign
|
||||
Realign the stack at entry. On the x86, the @option{-mstackrealign}
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
@ -0,0 +1,65 @@
|
||||
From e70fa730dcfcb3a7b1d56a2e166752d4299f0504 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Mon, 5 Jun 2023 12:38:41 +0800
|
||||
Subject: [PATCH 04/32] Explicitly view_convert_expr mask to signed type when
|
||||
folding pblendvb builtins.
|
||||
|
||||
Since mask < 0 will be always false for vector char when
|
||||
-funsigned-char, but vpblendvb needs to check the most significant
|
||||
bit. The patch explicitly VCE to vector signed char.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR target/110108
|
||||
* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
|
||||
view_convert_expr mask to signed type when folding pblendvb
|
||||
builtins.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/pr110108-2.c: New test.
|
||||
---
|
||||
gcc/config/i386/i386.cc | 4 +++-
|
||||
gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++
|
||||
2 files changed, 17 insertions(+), 1 deletion(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c
|
||||
|
||||
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
|
||||
index 462dce10e..479fc6010 100644
|
||||
--- a/gcc/config/i386/i386.cc
|
||||
+++ b/gcc/config/i386/i386.cc
|
||||
@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
|
||||
tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
|
||||
? intSI_type_node : intDI_type_node;
|
||||
type = get_same_sized_vectype (itype, type);
|
||||
- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
|
||||
}
|
||||
+ else
|
||||
+ type = signed_type_for (type);
|
||||
+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
|
||||
tree zero_vec = build_zero_cst (type);
|
||||
tree cmp_type = truth_type_for (type);
|
||||
tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c
|
||||
new file mode 100644
|
||||
index 000000000..2d1d2fd49
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
|
||||
@@ -0,0 +1,14 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
|
||||
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
|
||||
+
|
||||
+#include <immintrin.h>
|
||||
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
|
||||
+ __m128i Result = _mm_blendv_epi8(X0, X1, X2);
|
||||
+ return Result;
|
||||
+}
|
||||
+
|
||||
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
|
||||
+ __m256i Result = _mm256_blendv_epi8(X0, X1, X2);
|
||||
+ return Result;
|
||||
+}
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
138
0060-Make-option-mvzeroupper-independent-of-optimization-.patch
Normal file
138
0060-Make-option-mvzeroupper-independent-of-optimization-.patch
Normal file
@ -0,0 +1,138 @@
|
||||
From 48715f03ad08f185153bfb0ff4c0802ab2d9579c Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Mon, 26 Jun 2023 09:50:25 +0800
|
||||
Subject: [PATCH 05/32] Make option mvzeroupper independent of optimization
|
||||
level.
|
||||
|
||||
pass_insert_vzeroupper is under condition
|
||||
|
||||
TARGET_AVX && TARGET_VZEROUPPER
|
||||
&& flag_expensive_optimizations && !optimize_size
|
||||
|
||||
But the document of mvzeroupper doesn't mention the insertion
|
||||
required -O2 and above, it may confuse users when they explicitly
|
||||
use -Os -mvzeroupper.
|
||||
|
||||
------------
|
||||
mvzeroupper
|
||||
Target Mask(VZEROUPPER) Save
|
||||
Generate vzeroupper instruction before a transfer of control flow out of
|
||||
the function.
|
||||
------------
|
||||
|
||||
The patch moves flag_expensive_optimizations && !optimize_size to
|
||||
ix86_option_override_internal. It makes -mvzeroupper independent of
|
||||
optimization level, but still keeps the behavior of architecture
|
||||
tuning(emit_vzeroupper) unchanged.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
|
||||
Move flag_expensive_optimizations && !optimize_size to ..
|
||||
* config/i386/i386-options.cc (ix86_option_override_internal):
|
||||
.. this, it makes -mvzeroupper independent of optimization
|
||||
level, but still keeps the behavior of architecture
|
||||
tuning(emit_vzeroupper) unchanged.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
|
||||
* gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase.
|
||||
* gcc.target/i386/avx-vzeroupper-7.c: Ditto.
|
||||
* gcc.target/i386/avx-vzeroupper-9.c: Ditto.
|
||||
---
|
||||
gcc/config/i386/i386-features.cc | 3 +--
|
||||
gcc/config/i386/i386-options.cc | 4 +++-
|
||||
gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++-
|
||||
gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++
|
||||
gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++-
|
||||
gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++-
|
||||
6 files changed, 24 insertions(+), 6 deletions(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
|
||||
|
||||
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
|
||||
index 6fe41c3c2..6a2444eb6 100644
|
||||
--- a/gcc/config/i386/i386-features.cc
|
||||
+++ b/gcc/config/i386/i386-features.cc
|
||||
@@ -1875,8 +1875,7 @@ public:
|
||||
/* opt_pass methods: */
|
||||
virtual bool gate (function *)
|
||||
{
|
||||
- return TARGET_AVX && TARGET_VZEROUPPER
|
||||
- && flag_expensive_optimizations && !optimize_size;
|
||||
+ return TARGET_AVX && TARGET_VZEROUPPER;
|
||||
}
|
||||
|
||||
virtual unsigned int execute (function *)
|
||||
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
|
||||
index ff44ad4e0..74e969b68 100644
|
||||
--- a/gcc/config/i386/i386-options.cc
|
||||
+++ b/gcc/config/i386/i386-options.cc
|
||||
@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p,
|
||||
sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
|
||||
|
||||
if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
|
||||
- && TARGET_EMIT_VZEROUPPER)
|
||||
+ && TARGET_EMIT_VZEROUPPER
|
||||
+ && flag_expensive_optimizations
|
||||
+ && !optimize_size)
|
||||
opts->x_target_flags |= MASK_VZEROUPPER;
|
||||
if (!(opts_set->x_target_flags & MASK_STV))
|
||||
opts->x_target_flags |= MASK_STV;
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
|
||||
index e694d4048..5a40e8783 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
|
||||
@@ -16,5 +16,6 @@ foo ()
|
||||
_mm256_zeroupper ();
|
||||
}
|
||||
|
||||
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
|
||||
/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
|
||||
new file mode 100644
|
||||
index 000000000..4af637757
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
|
||||
@@ -0,0 +1,14 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */
|
||||
+
|
||||
+#include <immintrin.h>
|
||||
+
|
||||
+extern __m256 x, y;
|
||||
+
|
||||
+void
|
||||
+foo ()
|
||||
+{
|
||||
+ x = y;
|
||||
+}
|
||||
+
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
|
||||
index ab6d68779..75fe58897 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
|
||||
@@ -12,4 +12,5 @@ foo ()
|
||||
_mm256_zeroupper ();
|
||||
}
|
||||
|
||||
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
|
||||
index 974e1626a..fa0a6dfca 100644
|
||||
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
|
||||
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
|
||||
@@ -15,4 +15,5 @@ foo ()
|
||||
_mm256_zeroupper ();
|
||||
}
|
||||
|
||||
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
|
||||
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
@ -0,0 +1,68 @@
|
||||
From 8039d773354360ed8ff2f25c63843fc637eacc67 Mon Sep 17 00:00:00 2001
|
||||
From: Hongyu Wang <hongyu.wang@intel.com>
|
||||
Date: Sun, 25 Jun 2023 09:50:21 +0800
|
||||
Subject: [PATCH 06/32] i386: Sync tune_string with arch_string for target
|
||||
attribute
|
||||
|
||||
arch=*
|
||||
|
||||
For function with target attribute arch=*, current logic will set its
|
||||
tune to -mtune from command line so all target_clones will get same
|
||||
tuning flags which would affect the performance for each clone. Override
|
||||
tune with arch if tune was not explicitly specified to get proper tuning
|
||||
flags for target_clones.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
|
||||
Override tune_string with arch_string if tune_string is not
|
||||
explicitly specified.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/mvc17.c: New test.
|
||||
|
||||
(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8)
|
||||
---
|
||||
gcc/config/i386/i386-options.cc | 6 +++++-
|
||||
gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++
|
||||
2 files changed, 16 insertions(+), 1 deletion(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c
|
||||
|
||||
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
|
||||
index 74e969b68..fb2ed942f 100644
|
||||
--- a/gcc/config/i386/i386-options.cc
|
||||
+++ b/gcc/config/i386/i386-options.cc
|
||||
@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args,
|
||||
if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
|
||||
opts->x_ix86_tune_string
|
||||
= ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
|
||||
- else if (orig_tune_defaulted)
|
||||
+ /* If we have explicit arch string and no tune string specified, set
|
||||
+ tune_string to NULL and later it will be overriden by arch_string
|
||||
+ so target clones can get proper optimization. */
|
||||
+ else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
|
||||
+ || orig_tune_defaulted)
|
||||
opts->x_ix86_tune_string = NULL;
|
||||
|
||||
/* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c
|
||||
new file mode 100644
|
||||
index 000000000..8b83c1aec
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/mvc17.c
|
||||
@@ -0,0 +1,11 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* { dg-require-ifunc "" } */
|
||||
+/* { dg-options "-O2 -march=x86-64" } */
|
||||
+/* { dg-final { scan-assembler-times "rep mov" 1 } } */
|
||||
+
|
||||
+__attribute__((target_clones("default","arch=icelake-server")))
|
||||
+void
|
||||
+foo (char *a, char *b, int size)
|
||||
+{
|
||||
+ __builtin_memcpy (a, b, size & 0x7F);
|
||||
+}
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
111
0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
Normal file
111
0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
Normal file
@ -0,0 +1,111 @@
|
||||
From fbcb1a5899b1bd3964aed78ed74041121e618d36 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Tue, 20 Jun 2023 15:41:00 +0800
|
||||
Subject: [PATCH 07/32] Refine maskloadmn pattern with UNSPEC_MASKLOAD.
|
||||
|
||||
If mem_addr points to a memory region with less than whole vector size
|
||||
bytes of accessible memory and k is a mask that would prevent reading
|
||||
the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
|
||||
it to be transformed to vpblendd.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR target/110309
|
||||
* config/i386/sse.md (maskload<mode><avx512fmaskmodelower>):
|
||||
Refine pattern with UNSPEC_MASKLOAD.
|
||||
(maskload<mode><avx512fmaskmodelower>): Ditto.
|
||||
(*<avx512>_load<mode>_mask): Extend mode iterator to
|
||||
VI12HF_AVX512VL.
|
||||
(*<avx512>_load<mode>): Ditto.
|
||||
|
||||
gcc/testsuite/ChangeLog:
|
||||
|
||||
* gcc.target/i386/pr110309.c: New test.
|
||||
---
|
||||
gcc/config/i386/sse.md | 32 +++++++++++++-----------
|
||||
gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++
|
||||
2 files changed, 28 insertions(+), 14 deletions(-)
|
||||
create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c
|
||||
|
||||
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
|
||||
index eb767e56c..b30e96cb1 100644
|
||||
--- a/gcc/config/i386/sse.md
|
||||
+++ b/gcc/config/i386/sse.md
|
||||
@@ -1411,12 +1411,12 @@
|
||||
})
|
||||
|
||||
(define_insn "*<avx512>_load<mode>_mask"
|
||||
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
|
||||
- (vec_merge:VI12_AVX512VL
|
||||
- (unspec:VI12_AVX512VL
|
||||
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
|
||||
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
|
||||
+ (vec_merge:VI12HF_AVX512VL
|
||||
+ (unspec:VI12HF_AVX512VL
|
||||
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
|
||||
UNSPEC_MASKLOAD)
|
||||
- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
|
||||
+ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C")
|
||||
(match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
|
||||
"TARGET_AVX512BW"
|
||||
"vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
|
||||
@@ -1425,9 +1425,9 @@
|
||||
(set_attr "mode" "<sseinsnmode>")])
|
||||
|
||||
(define_insn_and_split "*<avx512>_load<mode>"
|
||||
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
|
||||
- (unspec:VI12_AVX512VL
|
||||
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
|
||||
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
|
||||
+ (unspec:VI12HF_AVX512VL
|
||||
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
|
||||
UNSPEC_MASKLOAD))]
|
||||
"TARGET_AVX512BW"
|
||||
"#"
|
||||
@@ -25973,17 +25973,21 @@
|
||||
"TARGET_AVX")
|
||||
|
||||
(define_expand "maskload<mode><avx512fmaskmodelower>"
|
||||
- [(set (match_operand:V48H_AVX512VL 0 "register_operand")
|
||||
- (vec_merge:V48H_AVX512VL
|
||||
- (match_operand:V48H_AVX512VL 1 "memory_operand")
|
||||
+ [(set (match_operand:V48_AVX512VL 0 "register_operand")
|
||||
+ (vec_merge:V48_AVX512VL
|
||||
+ (unspec:V48_AVX512VL
|
||||
+ [(match_operand:V48_AVX512VL 1 "memory_operand")]
|
||||
+ UNSPEC_MASKLOAD)
|
||||
(match_dup 0)
|
||||
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
|
||||
"TARGET_AVX512F")
|
||||
|
||||
(define_expand "maskload<mode><avx512fmaskmodelower>"
|
||||
- [(set (match_operand:VI12_AVX512VL 0 "register_operand")
|
||||
- (vec_merge:VI12_AVX512VL
|
||||
- (match_operand:VI12_AVX512VL 1 "memory_operand")
|
||||
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand")
|
||||
+ (vec_merge:VI12HF_AVX512VL
|
||||
+ (unspec:VI12HF_AVX512VL
|
||||
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand")]
|
||||
+ UNSPEC_MASKLOAD)
|
||||
(match_dup 0)
|
||||
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
|
||||
"TARGET_AVX512BW")
|
||||
diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c
|
||||
new file mode 100644
|
||||
index 000000000..f6e9e9c3c
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.target/i386/pr110309.c
|
||||
@@ -0,0 +1,10 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */
|
||||
+/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */
|
||||
+
|
||||
+
|
||||
+void foo (int * __restrict a, int *b)
|
||||
+{
|
||||
+ for (int i = 0; i < 6; ++i)
|
||||
+ a[i] = b[i] + 42;
|
||||
+}
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
126
0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
Normal file
126
0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
Normal file
@ -0,0 +1,126 @@
|
||||
From 5ad28ef4010c1248b4d94396d03f863705f7b0db Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Mon, 26 Jun 2023 21:07:09 +0800
|
||||
Subject: [PATCH 08/32] Refine maskstore patterns with UNSPEC_MASKMOV.
|
||||
|
||||
Similar like r14-2070-gc79476da46728e
|
||||
|
||||
If mem_addr points to a memory region with less than whole vector size
|
||||
bytes of accessible memory and k is a mask that would prevent reading
|
||||
the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
|
||||
it to be transformed to any other whole memory access instructions.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
PR rtl-optimization/110237
|
||||
* config/i386/sse.md (<avx512>_store<mode>_mask): Refine with
|
||||
UNSPEC_MASKMOV.
|
||||
(maskstore<mode><avx512fmaskmodelower): Ditto.
|
||||
(*<avx512>_store<mode>_mask): New define_insn, it's renamed
|
||||
from original <avx512>_store<mode>_mask.
|
||||
---
|
||||
gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++--------
|
||||
1 file changed, 57 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
|
||||
index b30e96cb1..3af159896 100644
|
||||
--- a/gcc/config/i386/sse.md
|
||||
+++ b/gcc/config/i386/sse.md
|
||||
@@ -1554,7 +1554,7 @@
|
||||
(set_attr "prefix" "evex")
|
||||
(set_attr "mode" "<sseinsnmode>")])
|
||||
|
||||
-(define_insn "<avx512>_store<mode>_mask"
|
||||
+(define_insn "*<avx512>_store<mode>_mask"
|
||||
[(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
|
||||
(vec_merge:V48_AVX512VL
|
||||
(match_operand:V48_AVX512VL 1 "register_operand" "v")
|
||||
@@ -1582,7 +1582,7 @@
|
||||
(set_attr "memory" "store")
|
||||
(set_attr "mode" "<sseinsnmode>")])
|
||||
|
||||
-(define_insn "<avx512>_store<mode>_mask"
|
||||
+(define_insn "*<avx512>_store<mode>_mask"
|
||||
[(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
|
||||
(vec_merge:VI12HF_AVX512VL
|
||||
(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
|
||||
@@ -26002,21 +26002,66 @@
|
||||
"TARGET_AVX")
|
||||
|
||||
(define_expand "maskstore<mode><avx512fmaskmodelower>"
|
||||
- [(set (match_operand:V48H_AVX512VL 0 "memory_operand")
|
||||
- (vec_merge:V48H_AVX512VL
|
||||
- (match_operand:V48H_AVX512VL 1 "register_operand")
|
||||
- (match_dup 0)
|
||||
- (match_operand:<avx512fmaskmode> 2 "register_operand")))]
|
||||
+ [(set (match_operand:V48_AVX512VL 0 "memory_operand")
|
||||
+ (unspec:V48_AVX512VL
|
||||
+ [(match_operand:V48_AVX512VL 1 "register_operand")
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:<avx512fmaskmode> 2 "register_operand")]
|
||||
+ UNSPEC_MASKMOV))]
|
||||
"TARGET_AVX512F")
|
||||
|
||||
(define_expand "maskstore<mode><avx512fmaskmodelower>"
|
||||
- [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
|
||||
- (vec_merge:VI12_AVX512VL
|
||||
- (match_operand:VI12_AVX512VL 1 "register_operand")
|
||||
- (match_dup 0)
|
||||
- (match_operand:<avx512fmaskmode> 2 "register_operand")))]
|
||||
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand")
|
||||
+ (unspec:VI12HF_AVX512VL
|
||||
+ [(match_operand:VI12HF_AVX512VL 1 "register_operand")
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:<avx512fmaskmode> 2 "register_operand")]
|
||||
+ UNSPEC_MASKMOV))]
|
||||
"TARGET_AVX512BW")
|
||||
|
||||
+(define_insn "<avx512>_store<mode>_mask"
|
||||
+ [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
|
||||
+ (unspec:V48_AVX512VL
|
||||
+ [(match_operand:V48_AVX512VL 1 "register_operand" "v")
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
|
||||
+ UNSPEC_MASKMOV))]
|
||||
+ "TARGET_AVX512F"
|
||||
+{
|
||||
+ if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
|
||||
+ {
|
||||
+ if (misaligned_operand (operands[0], <MODE>mode))
|
||||
+ return "vmovu<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
|
||||
+ else
|
||||
+ return "vmova<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ if (misaligned_operand (operands[0], <MODE>mode))
|
||||
+ return "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
|
||||
+ else
|
||||
+ return "vmovdqa<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
|
||||
+ }
|
||||
+}
|
||||
+ [(set_attr "type" "ssemov")
|
||||
+ (set_attr "prefix" "evex")
|
||||
+ (set_attr "memory" "store")
|
||||
+ (set_attr "mode" "<sseinsnmode>")])
|
||||
+
|
||||
+(define_insn "<avx512>_store<mode>_mask"
|
||||
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
|
||||
+ (unspec:VI12HF_AVX512VL
|
||||
+ [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
|
||||
+ (match_dup 0)
|
||||
+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
|
||||
+ UNSPEC_MASKMOV))]
|
||||
+ "TARGET_AVX512BW"
|
||||
+ "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
|
||||
+ [(set_attr "type" "ssemov")
|
||||
+ (set_attr "prefix" "evex")
|
||||
+ (set_attr "memory" "store")
|
||||
+ (set_attr "mode" "<sseinsnmode>")])
|
||||
+
|
||||
(define_expand "cbranch<mode>4"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(compare:CC (match_operand:VI48_AVX 1 "register_operand")
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
@ -0,0 +1,38 @@
|
||||
From 50757adc93ef32a97a8a1083f5d53a9c00da6ac8 Mon Sep 17 00:00:00 2001
|
||||
From: "Cui, Lili" <lili.cui@intel.com>
|
||||
Date: Thu, 29 Jun 2023 03:10:35 +0000
|
||||
Subject: [PATCH 09/32] x86: Update model values for Alderlake and Rocketlake.
|
||||
|
||||
Update model values for Alderlake and Rocketlake according to SDM.
|
||||
|
||||
gcc/ChangeLog
|
||||
|
||||
* common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8
|
||||
from Rocketlake, remove model value 0xbf from Alderlake.
|
||||
---
|
||||
gcc/common/config/i386/cpuinfo.h | 2 --
|
||||
1 file changed, 2 deletions(-)
|
||||
|
||||
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
|
||||
index 0333da56b..28b2ff0b0 100644
|
||||
--- a/gcc/common/config/i386/cpuinfo.h
|
||||
+++ b/gcc/common/config/i386/cpuinfo.h
|
||||
@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
|
||||
cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE;
|
||||
break;
|
||||
case 0xa7:
|
||||
- case 0xa8:
|
||||
/* Rocket Lake. */
|
||||
cpu = "rocketlake";
|
||||
CHECK___builtin_cpu_is ("corei7");
|
||||
@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
|
||||
break;
|
||||
case 0x97:
|
||||
case 0x9a:
|
||||
- case 0xbf:
|
||||
/* Alder Lake. */
|
||||
cpu = "alderlake";
|
||||
CHECK___builtin_cpu_is ("corei7");
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
78
0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
Normal file
78
0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
Normal file
@ -0,0 +1,78 @@
|
||||
From 60364b439a80c217174e1830e0b7507d6f4538c4 Mon Sep 17 00:00:00 2001
|
||||
From: liuhongt <hongtao.liu@intel.com>
|
||||
Date: Fri, 4 Aug 2023 09:27:39 +0800
|
||||
Subject: [PATCH 10/32] Workaround possible CPUID bug in Sandy Bridge.
|
||||
|
||||
Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
|
||||
supported via EAX.
|
||||
|
||||
Intel documentation says invalid subleaves return 0. We had been
|
||||
relying on that behavior instead of checking the max sublef number.
|
||||
|
||||
It appears that some Sandy Bridge CPUs return at least the subleaf 0
|
||||
EDX value for subleaf 1. Best guess is that this is a bug in a
|
||||
microcode patch since all of the bits we're seeing set in EDX were
|
||||
introduced after Sandy Bridge was originally released.
|
||||
|
||||
This is causing avxvnniint16 to be incorrectly enabled with
|
||||
-march=native on these CPUs.
|
||||
|
||||
gcc/ChangeLog:
|
||||
|
||||
* common/config/i386/cpuinfo.h (get_available_features): Check
|
||||
max_subleaf_level for valid subleaf before use CPUID.
|
||||
---
|
||||
gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------
|
||||
1 file changed, 17 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
|
||||
index 28b2ff0b0..316ad3cb3 100644
|
||||
--- a/gcc/common/config/i386/cpuinfo.h
|
||||
+++ b/gcc/common/config/i386/cpuinfo.h
|
||||
@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model,
|
||||
/* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
|
||||
if (max_cpuid_level >= 7)
|
||||
{
|
||||
- __cpuid_count (7, 0, eax, ebx, ecx, edx);
|
||||
+ unsigned int max_subleaf_level;
|
||||
+
|
||||
+ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx);
|
||||
if (ebx & bit_BMI)
|
||||
set_feature (FEATURE_BMI);
|
||||
if (ebx & bit_SGX)
|
||||
@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model,
|
||||
set_feature (FEATURE_AVX512FP16);
|
||||
}
|
||||
|
||||
- __cpuid_count (7, 1, eax, ebx, ecx, edx);
|
||||
- if (eax & bit_HRESET)
|
||||
- set_feature (FEATURE_HRESET);
|
||||
- if (avx_usable)
|
||||
- {
|
||||
- if (eax & bit_AVXVNNI)
|
||||
- set_feature (FEATURE_AVXVNNI);
|
||||
- }
|
||||
- if (avx512_usable)
|
||||
+ if (max_subleaf_level >= 1)
|
||||
{
|
||||
- if (eax & bit_AVX512BF16)
|
||||
- set_feature (FEATURE_AVX512BF16);
|
||||
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
|
||||
+ if (eax & bit_HRESET)
|
||||
+ set_feature (FEATURE_HRESET);
|
||||
+ if (avx_usable)
|
||||
+ {
|
||||
+ if (eax & bit_AVXVNNI)
|
||||
+ set_feature (FEATURE_AVXVNNI);
|
||||
+ }
|
||||
+ if (avx512_usable)
|
||||
+ {
|
||||
+ if (eax & bit_AVX512BF16)
|
||||
+ set_feature (FEATURE_AVX512BF16);
|
||||
+ }
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
2.28.0.windows.1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user