[Sync] Sync patch from openeuler/gcc

This commit is contained in:
wangding16 2024-04-24 12:43:15 +08:00
parent e487f0b8ff
commit 25bccb60b0
8 changed files with 759 additions and 0 deletions

View File

@ -0,0 +1,135 @@
From 1649f9fbbc5267de2a675336d3ac665528a03db8 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 10 May 2023 15:16:58 +0800
Subject: [PATCH 03/32] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ
flags in MXCSR.
if (mdaz-ftz)
link crtfastmath.o
else if ((Ofast || ffast-math || funsafe-math-optimizations)
&& !mno-daz-ftz)
link crtfastmath.o
else
Don't link crtfastmath.o
gcc/ChangeLog:
* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
whenever -mdaz-ftz is specified. Don't link crtfastmath.o
when -mno-daz-ftz is specified.
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
* config/i386/gnu-user-common.h
(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
* config/i386/i386.opt (mdaz-ftz): New option.
* doc/invoke.texi (x86 options): Document mftz-daz.
---
gcc/config/i386/cygwin.h | 2 +-
gcc/config/i386/darwin.h | 4 ++--
gcc/config/i386/gnu-user-common.h | 2 +-
gcc/config/i386/i386.opt | 4 ++++
gcc/config/i386/mingw32.h | 2 +-
gcc/doc/invoke.texi | 11 ++++++++++-
6 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index d06eda369..5412c5d44 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see
#undef ENDFILE_SPEC
#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
index a55f6b2b8..2f773924d 100644
--- a/gcc/config/i386/darwin.h
+++ b/gcc/config/i386/darwin.h
@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. If not see
"%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} "
#undef ENDFILE_SPEC
-#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+#define ENDFILE_SPEC
+\ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h
index 23b54c5be..3d2a33f17 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see
/* Similar to standard GNU userspace, but adding -ffast-math support. */
#define GNU_USER_TARGET_MATHFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index fc1b944ac..498fb454d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,6 +420,10 @@ mpc80
Target RejectNegative
Set 80387 floating-point precision to 80-bit.
+mdaz-ftz
+Target
+Set the FTZ and DAZ Flags.
+
mpreferred-stack-boundary=
Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
Attempt to keep stack aligned to this power of 2.
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index d3ca0cd02..ddbe6a405 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. If not see
#undef ENDFILE_SPEC
#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2b376e0e9..3a48655e5 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options.
-m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol
-mregparm=@var{num} -msseregparm @gol
-mveclibabi=@var{type} -mvect8-ret-in-mem @gol
--mpc32 -mpc64 -mpc80 -mstackrealign @gol
+-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol
-momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
-mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol
-m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol
@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant
loss of accuracy, typically through so-called ``catastrophic cancellation'',
when this option is used to set the precision to less than extended precision.
+@item -mdaz-ftz
+@opindex mdaz-ftz
+
+The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register
+are used to control floating-point calculations.SSE and AVX instructions
+including scalar and vector instructions could benefit from enabling the FTZ
+and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags
+when @option{-mno-daz-ftz} is specified.
+
@item -mstackrealign
@opindex mstackrealign
Realign the stack at entry. On the x86, the @option{-mstackrealign}
--
2.28.0.windows.1

View File

@ -0,0 +1,65 @@
From e70fa730dcfcb3a7b1d56a2e166752d4299f0504 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 5 Jun 2023 12:38:41 +0800
Subject: [PATCH 04/32] Explicitly view_convert_expr mask to signed type when
folding pblendvb builtins.
Since mask < 0 will be always false for vector char when
-funsigned-char, but vpblendvb needs to check the most significant
bit. The patch explicitly VCE to vector signed char.
gcc/ChangeLog:
PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
view_convert_expr mask to signed type when folding pblendvb
builtins.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110108-2.c: New test.
---
gcc/config/i386/i386.cc | 4 +++-
gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 462dce10e..479fc6010 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
? intSI_type_node : intDI_type_node;
type = get_same_sized_vectype (itype, type);
- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
}
+ else
+ type = signed_type_for (type);
+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
tree zero_vec = build_zero_cst (type);
tree cmp_type = truth_type_for (type);
tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c
new file mode 100644
index 000000000..2d1d2fd49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
+
+#include <immintrin.h>
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
+ __m128i Result = _mm_blendv_epi8(X0, X1, X2);
+ return Result;
+}
+
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
+ __m256i Result = _mm256_blendv_epi8(X0, X1, X2);
+ return Result;
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,138 @@
From 48715f03ad08f185153bfb0ff4c0802ab2d9579c Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 26 Jun 2023 09:50:25 +0800
Subject: [PATCH 05/32] Make option mvzeroupper independent of optimization
level.
pass_insert_vzeroupper is under condition
TARGET_AVX && TARGET_VZEROUPPER
&& flag_expensive_optimizations && !optimize_size
But the document of mvzeroupper doesn't mention the insertion
required -O2 and above, it may confuse users when they explicitly
use -Os -mvzeroupper.
------------
mvzeroupper
Target Mask(VZEROUPPER) Save
Generate vzeroupper instruction before a transfer of control flow out of
the function.
------------
The patch moves flag_expensive_optimizations && !optimize_size to
ix86_option_override_internal. It makes -mvzeroupper independent of
optimization level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.
gcc/ChangeLog:
* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
Move flag_expensive_optimizations && !optimize_size to ..
* config/i386/i386-options.cc (ix86_option_override_internal):
.. this, it makes -mvzeroupper independent of optimization
level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
* gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase.
* gcc.target/i386/avx-vzeroupper-7.c: Ditto.
* gcc.target/i386/avx-vzeroupper-9.c: Ditto.
---
gcc/config/i386/i386-features.cc | 3 +--
gcc/config/i386/i386-options.cc | 4 +++-
gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++-
gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++
gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++-
gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++-
6 files changed, 24 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6fe41c3c2..6a2444eb6 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1875,8 +1875,7 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
- return TARGET_AVX && TARGET_VZEROUPPER
- && flag_expensive_optimizations && !optimize_size;
+ return TARGET_AVX && TARGET_VZEROUPPER;
}
virtual unsigned int execute (function *)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index ff44ad4e0..74e969b68 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p,
sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
- && TARGET_EMIT_VZEROUPPER)
+ && TARGET_EMIT_VZEROUPPER
+ && flag_expensive_optimizations
+ && !optimize_size)
opts->x_target_flags |= MASK_VZEROUPPER;
if (!(opts_set->x_target_flags & MASK_STV))
opts->x_target_flags |= MASK_STV;
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
index e694d4048..5a40e8783 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
@@ -16,5 +16,6 @@ foo ()
_mm256_zeroupper ();
}
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
new file mode 100644
index 000000000..4af637757
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
index ab6d68779..75fe58897 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
@@ -12,4 +12,5 @@ foo ()
_mm256_zeroupper ();
}
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
index 974e1626a..fa0a6dfca 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
@@ -15,4 +15,5 @@ foo ()
_mm256_zeroupper ();
}
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
--
2.28.0.windows.1

View File

@ -0,0 +1,68 @@
From 8039d773354360ed8ff2f25c63843fc637eacc67 Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Sun, 25 Jun 2023 09:50:21 +0800
Subject: [PATCH 06/32] i386: Sync tune_string with arch_string for target
attribute
arch=*
For function with target attribute arch=*, current logic will set its
tune to -mtune from command line so all target_clones will get same
tuning flags which would affect the performance for each clone. Override
tune with arch if tune was not explicitly specified to get proper tuning
flags for target_clones.
gcc/ChangeLog:
* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
Override tune_string with arch_string if tune_string is not
explicitly specified.
gcc/testsuite/ChangeLog:
* gcc.target/i386/mvc17.c: New test.
(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8)
---
gcc/config/i386/i386-options.cc | 6 +++++-
gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 74e969b68..fb2ed942f 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args,
if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
opts->x_ix86_tune_string
= ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
- else if (orig_tune_defaulted)
+ /* If we have explicit arch string and no tune string specified, set
+ tune_string to NULL and later it will be overriden by arch_string
+ so target clones can get proper optimization. */
+ else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
+ || orig_tune_defaulted)
opts->x_ix86_tune_string = NULL;
/* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c
new file mode 100644
index 000000000..8b83c1aec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mvc17.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-ifunc "" } */
+/* { dg-options "-O2 -march=x86-64" } */
+/* { dg-final { scan-assembler-times "rep mov" 1 } } */
+
+__attribute__((target_clones("default","arch=icelake-server")))
+void
+foo (char *a, char *b, int size)
+{
+ __builtin_memcpy (a, b, size & 0x7F);
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,111 @@
From fbcb1a5899b1bd3964aed78ed74041121e618d36 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 20 Jun 2023 15:41:00 +0800
Subject: [PATCH 07/32] Refine maskloadmn pattern with UNSPEC_MASKLOAD.
If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
it to be transformed to vpblendd.
gcc/ChangeLog:
PR target/110309
* config/i386/sse.md (maskload<mode><avx512fmaskmodelower>):
Refine pattern with UNSPEC_MASKLOAD.
(maskload<mode><avx512fmaskmodelower>): Ditto.
(*<avx512>_load<mode>_mask): Extend mode iterator to
VI12HF_AVX512VL.
(*<avx512>_load<mode>): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110309.c: New test.
---
gcc/config/i386/sse.md | 32 +++++++++++++-----------
gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++
2 files changed, 28 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index eb767e56c..b30e96cb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1411,12 +1411,12 @@
})
(define_insn "*<avx512>_load<mode>_mask"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
- (vec_merge:VI12_AVX512VL
- (unspec:VI12_AVX512VL
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI12HF_AVX512VL
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
UNSPEC_MASKLOAD)
- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
+ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C")
(match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
"TARGET_AVX512BW"
"vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
@@ -1425,9 +1425,9 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn_and_split "*<avx512>_load<mode>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
- (unspec:VI12_AVX512VL
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
UNSPEC_MASKLOAD))]
"TARGET_AVX512BW"
"#"
@@ -25973,17 +25973,21 @@
"TARGET_AVX")
(define_expand "maskload<mode><avx512fmaskmodelower>"
- [(set (match_operand:V48H_AVX512VL 0 "register_operand")
- (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "memory_operand")
+ [(set (match_operand:V48_AVX512VL 0 "register_operand")
+ (vec_merge:V48_AVX512VL
+ (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "memory_operand")]
+ UNSPEC_MASKLOAD)
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
"TARGET_AVX512F")
(define_expand "maskload<mode><avx512fmaskmodelower>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand")
- (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "memory_operand")
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand")
+ (vec_merge:VI12HF_AVX512VL
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand")]
+ UNSPEC_MASKLOAD)
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
"TARGET_AVX512BW")
diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c
new file mode 100644
index 000000000..f6e9e9c3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110309.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */
+/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */
+
+
+void foo (int * __restrict a, int *b)
+{
+ for (int i = 0; i < 6; ++i)
+ a[i] = b[i] + 42;
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,126 @@
From 5ad28ef4010c1248b4d94396d03f863705f7b0db Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 26 Jun 2023 21:07:09 +0800
Subject: [PATCH 08/32] Refine maskstore patterns with UNSPEC_MASKMOV.
Similar like r14-2070-gc79476da46728e
If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
it to be transformed to any other whole memory access instructions.
gcc/ChangeLog:
PR rtl-optimization/110237
* config/i386/sse.md (<avx512>_store<mode>_mask): Refine with
UNSPEC_MASKMOV.
(maskstore<mode><avx512fmaskmodelower): Ditto.
(*<avx512>_store<mode>_mask): New define_insn, it's renamed
from original <avx512>_store<mode>_mask.
---
gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++--------
1 file changed, 57 insertions(+), 12 deletions(-)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b30e96cb1..3af159896 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1554,7 +1554,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "<avx512>_store<mode>_mask"
+(define_insn "*<avx512>_store<mode>_mask"
[(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
(vec_merge:V48_AVX512VL
(match_operand:V48_AVX512VL 1 "register_operand" "v")
@@ -1582,7 +1582,7 @@
(set_attr "memory" "store")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "<avx512>_store<mode>_mask"
+(define_insn "*<avx512>_store<mode>_mask"
[(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
(vec_merge:VI12HF_AVX512VL
(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
@@ -26002,21 +26002,66 @@
"TARGET_AVX")
(define_expand "maskstore<mode><avx512fmaskmodelower>"
- [(set (match_operand:V48H_AVX512VL 0 "memory_operand")
- (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+ [(set (match_operand:V48_AVX512VL 0 "memory_operand")
+ (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand")]
+ UNSPEC_MASKMOV))]
"TARGET_AVX512F")
(define_expand "maskstore<mode><avx512fmaskmodelower>"
- [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
- (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand")
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "register_operand")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand")]
+ UNSPEC_MASKMOV))]
"TARGET_AVX512BW")
+(define_insn "<avx512>_store<mode>_mask"
+ [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
+ (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand" "v")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
+ UNSPEC_MASKMOV))]
+ "TARGET_AVX512F"
+{
+ if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
+ {
+ if (misaligned_operand (operands[0], <MODE>mode))
+ return "vmovu<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ else
+ return "vmova<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ }
+ else
+ {
+ if (misaligned_operand (operands[0], <MODE>mode))
+ return "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ else
+ return "vmovdqa<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ }
+}
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "memory" "store")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx512>_store<mode>_mask"
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
+ UNSPEC_MASKMOV))]
+ "TARGET_AVX512BW"
+ "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "memory" "store")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "cbranch<mode>4"
[(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI48_AVX 1 "register_operand")
--
2.28.0.windows.1

View File

@ -0,0 +1,38 @@
From 50757adc93ef32a97a8a1083f5d53a9c00da6ac8 Mon Sep 17 00:00:00 2001
From: "Cui, Lili" <lili.cui@intel.com>
Date: Thu, 29 Jun 2023 03:10:35 +0000
Subject: [PATCH 09/32] x86: Update model values for Alderlake and Rocketlake.
Update model values for Alderlake and Rocketlake according to SDM.
gcc/ChangeLog
* common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8
from Rocketlake, remove model value 0xbf from Alderlake.
---
gcc/common/config/i386/cpuinfo.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 0333da56b..28b2ff0b0 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE;
break;
case 0xa7:
- case 0xa8:
/* Rocket Lake. */
cpu = "rocketlake";
CHECK___builtin_cpu_is ("corei7");
@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
break;
case 0x97:
case 0x9a:
- case 0xbf:
/* Alder Lake. */
cpu = "alderlake";
CHECK___builtin_cpu_is ("corei7");
--
2.28.0.windows.1

View File

@ -0,0 +1,78 @@
From 60364b439a80c217174e1830e0b7507d6f4538c4 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 4 Aug 2023 09:27:39 +0800
Subject: [PATCH 10/32] Workaround possible CPUID bug in Sandy Bridge.
Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.
Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max sublef number.
It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.
This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features): Check
max_subleaf_level for valid subleaf before use CPUID.
---
gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 28b2ff0b0..316ad3cb3 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model,
/* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
if (max_cpuid_level >= 7)
{
- __cpuid_count (7, 0, eax, ebx, ecx, edx);
+ unsigned int max_subleaf_level;
+
+ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx);
if (ebx & bit_BMI)
set_feature (FEATURE_BMI);
if (ebx & bit_SGX)
@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX512FP16);
}
- __cpuid_count (7, 1, eax, ebx, ecx, edx);
- if (eax & bit_HRESET)
- set_feature (FEATURE_HRESET);
- if (avx_usable)
- {
- if (eax & bit_AVXVNNI)
- set_feature (FEATURE_AVXVNNI);
- }
- if (avx512_usable)
+ if (max_subleaf_level >= 1)
{
- if (eax & bit_AVX512BF16)
- set_feature (FEATURE_AVX512BF16);
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
+ if (eax & bit_HRESET)
+ set_feature (FEATURE_HRESET);
+ if (avx_usable)
+ {
+ if (eax & bit_AVXVNNI)
+ set_feature (FEATURE_AVXVNNI);
+ }
+ if (avx512_usable)
+ {
+ if (eax & bit_AVX512BF16)
+ set_feature (FEATURE_AVX512BF16);
+ }
}
}
--
2.28.0.windows.1