diff --git a/0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch b/0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch new file mode 100644 index 0000000..94c7d92 --- /dev/null +++ b/0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch @@ -0,0 +1,135 @@ +From 1649f9fbbc5267de2a675336d3ac665528a03db8 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 10 May 2023 15:16:58 +0800 +Subject: [PATCH 03/32] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ + flags in MXCSR. + + if (mdaz-ftz) + link crtfastmath.o + else if ((Ofast || ffast-math || funsafe-math-optimizations) + && !mno-daz-ftz) + link crtfastmath.o + else + Don't link crtfastmath.o + +gcc/ChangeLog: + + * config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o + whenever -mdaz-ftz is specified. Don't link crtfastmath.o + when -mno-daz-ftz is specified. + * config/i386/darwin.h (ENDFILE_SPEC): Ditto. + * config/i386/gnu-user-common.h + (GNU_USER_TARGET_MATHFILE_SPEC): Ditto. + * config/i386/mingw32.h (ENDFILE_SPEC): Ditto. + * config/i386/i386.opt (mdaz-ftz): New option. + * doc/invoke.texi (x86 options): Document mftz-daz. +--- + gcc/config/i386/cygwin.h | 2 +- + gcc/config/i386/darwin.h | 4 ++-- + gcc/config/i386/gnu-user-common.h | 2 +- + gcc/config/i386/i386.opt | 4 ++++ + gcc/config/i386/mingw32.h | 2 +- + gcc/doc/invoke.texi | 11 ++++++++++- + 6 files changed, 19 insertions(+), 6 deletions(-) + +diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h +index d06eda369..5412c5d44 100644 +--- a/gcc/config/i386/cygwin.h ++++ b/gcc/config/i386/cygwin.h +@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see + + #undef ENDFILE_SPEC + #define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{!shared:%:if-exists(default-manifest.o%s)}\ + %{fvtable-verify=none:%s; \ + fvtable-verify=preinit:vtv_end.o%s; \ +diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h +index a55f6b2b8..2f773924d 100644 +--- a/gcc/config/i386/darwin.h ++++ b/gcc/config/i386/darwin.h +@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. If not see + "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} " + + #undef ENDFILE_SPEC +-#define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++#define ENDFILE_SPEC ++\ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR +diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h +index 23b54c5be..3d2a33f17 100644 +--- a/gcc/config/i386/gnu-user-common.h ++++ b/gcc/config/i386/gnu-user-common.h +@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see + + /* Similar to standard GNU userspace, but adding -ffast-math support. */ + #define GNU_USER_TARGET_MATHFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index fc1b944ac..498fb454d 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -420,6 +420,10 @@ mpc80 + Target RejectNegative + Set 80387 floating-point precision to 80-bit. + ++mdaz-ftz ++Target ++Set the FTZ and DAZ Flags. ++ + mpreferred-stack-boundary= + Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg) + Attempt to keep stack aligned to this power of 2. +diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h +index d3ca0cd02..ddbe6a405 100644 +--- a/gcc/config/i386/mingw32.h ++++ b/gcc/config/i386/mingw32.h +@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. If not see + + #undef ENDFILE_SPEC + #define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{!shared:%:if-exists(default-manifest.o%s)}\ + %{fvtable-verify=none:%s; \ + fvtable-verify=preinit:vtv_end.o%s; \ +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 2b376e0e9..3a48655e5 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options. + -m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol + -mregparm=@var{num} -msseregparm @gol + -mveclibabi=@var{type} -mvect8-ret-in-mem @gol +--mpc32 -mpc64 -mpc80 -mstackrealign @gol ++-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol + -momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol + -mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol + -m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol +@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant + loss of accuracy, typically through so-called ``catastrophic cancellation'', + when this option is used to set the precision to less than extended precision. + ++@item -mdaz-ftz ++@opindex mdaz-ftz ++ ++The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register ++are used to control floating-point calculations.SSE and AVX instructions ++including scalar and vector instructions could benefit from enabling the FTZ ++and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags ++when @option{-mno-daz-ftz} is specified. ++ + @item -mstackrealign + @opindex mstackrealign + Realign the stack at entry. On the x86, the @option{-mstackrealign} +-- +2.28.0.windows.1 + diff --git a/0059-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch b/0059-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch new file mode 100644 index 0000000..d29377e --- /dev/null +++ b/0059-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch @@ -0,0 +1,65 @@ +From e70fa730dcfcb3a7b1d56a2e166752d4299f0504 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 5 Jun 2023 12:38:41 +0800 +Subject: [PATCH 04/32] Explicitly view_convert_expr mask to signed type when + folding pblendvb builtins. + +Since mask < 0 will be always false for vector char when +-funsigned-char, but vpblendvb needs to check the most significant +bit. The patch explicitly VCE to vector signed char. + +gcc/ChangeLog: + + PR target/110108 + * config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly + view_convert_expr mask to signed type when folding pblendvb + builtins. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110108-2.c: New test. +--- + gcc/config/i386/i386.cc | 4 +++- + gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++ + 2 files changed, 17 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 462dce10e..479fc6010 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) + tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode + ? intSI_type_node : intDI_type_node; + type = get_same_sized_vectype (itype, type); +- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + } ++ else ++ type = signed_type_for (type); ++ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + tree zero_vec = build_zero_cst (type); + tree cmp_type = truth_type_for (type); + tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); +diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c +new file mode 100644 +index 000000000..2d1d2fd49 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O2 -funsigned-char" } */ ++/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ ++ ++#include ++__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) { ++ __m128i Result = _mm_blendv_epi8(X0, X1, X2); ++ return Result; ++} ++ ++__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) { ++ __m256i Result = _mm256_blendv_epi8(X0, X1, X2); ++ return Result; ++} +-- +2.28.0.windows.1 + diff --git a/0060-Make-option-mvzeroupper-independent-of-optimization-.patch b/0060-Make-option-mvzeroupper-independent-of-optimization-.patch new file mode 100644 index 0000000..c15cff7 --- /dev/null +++ b/0060-Make-option-mvzeroupper-independent-of-optimization-.patch @@ -0,0 +1,138 @@ +From 48715f03ad08f185153bfb0ff4c0802ab2d9579c Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 26 Jun 2023 09:50:25 +0800 +Subject: [PATCH 05/32] Make option mvzeroupper independent of optimization + level. + +pass_insert_vzeroupper is under condition + +TARGET_AVX && TARGET_VZEROUPPER +&& flag_expensive_optimizations && !optimize_size + +But the document of mvzeroupper doesn't mention the insertion +required -O2 and above, it may confuse users when they explicitly +use -Os -mvzeroupper. + +------------ +mvzeroupper +Target Mask(VZEROUPPER) Save +Generate vzeroupper instruction before a transfer of control flow out of +the function. +------------ + +The patch moves flag_expensive_optimizations && !optimize_size to +ix86_option_override_internal. It makes -mvzeroupper independent of +optimization level, but still keeps the behavior of architecture +tuning(emit_vzeroupper) unchanged. + +gcc/ChangeLog: + + * config/i386/i386-features.cc (pass_insert_vzeroupper:gate): + Move flag_expensive_optimizations && !optimize_size to .. + * config/i386/i386-options.cc (ix86_option_override_internal): + .. this, it makes -mvzeroupper independent of optimization + level, but still keeps the behavior of architecture + tuning(emit_vzeroupper) unchanged. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx-vzeroupper-29.c: New testcase. + * gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase. + * gcc.target/i386/avx-vzeroupper-7.c: Ditto. + * gcc.target/i386/avx-vzeroupper-9.c: Ditto. +--- + gcc/config/i386/i386-features.cc | 3 +-- + gcc/config/i386/i386-options.cc | 4 +++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++ + gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++- + 6 files changed, 24 insertions(+), 6 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c + +diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc +index 6fe41c3c2..6a2444eb6 100644 +--- a/gcc/config/i386/i386-features.cc ++++ b/gcc/config/i386/i386-features.cc +@@ -1875,8 +1875,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- return TARGET_AVX && TARGET_VZEROUPPER +- && flag_expensive_optimizations && !optimize_size; ++ return TARGET_AVX && TARGET_VZEROUPPER; + } + + virtual unsigned int execute (function *) +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index ff44ad4e0..74e969b68 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p, + sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); + + if (!(opts_set->x_target_flags & MASK_VZEROUPPER) +- && TARGET_EMIT_VZEROUPPER) ++ && TARGET_EMIT_VZEROUPPER ++ && flag_expensive_optimizations ++ && !optimize_size) + opts->x_target_flags |= MASK_VZEROUPPER; + if (!(opts_set->x_target_flags & MASK_STV)) + opts->x_target_flags |= MASK_STV; +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c +index e694d4048..5a40e8783 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c +@@ -16,5 +16,6 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ + /* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c +new file mode 100644 +index 000000000..4af637757 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */ ++ ++#include ++ ++extern __m256 x, y; ++ ++void ++foo () ++{ ++ x = y; ++} ++ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c +index ab6d68779..75fe58897 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c +@@ -12,4 +12,5 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c +index 974e1626a..fa0a6dfca 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c +@@ -15,4 +15,5 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ +-- +2.28.0.windows.1 + diff --git a/0061-i386-Sync-tune_string-with-arch_string-for-target-at.patch b/0061-i386-Sync-tune_string-with-arch_string-for-target-at.patch new file mode 100644 index 0000000..09c43a5 --- /dev/null +++ b/0061-i386-Sync-tune_string-with-arch_string-for-target-at.patch @@ -0,0 +1,68 @@ +From 8039d773354360ed8ff2f25c63843fc637eacc67 Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Sun, 25 Jun 2023 09:50:21 +0800 +Subject: [PATCH 06/32] i386: Sync tune_string with arch_string for target + attribute + +arch=* + +For function with target attribute arch=*, current logic will set its +tune to -mtune from command line so all target_clones will get same +tuning flags which would affect the performance for each clone. Override +tune with arch if tune was not explicitly specified to get proper tuning +flags for target_clones. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (ix86_valid_target_attribute_tree): + Override tune_string with arch_string if tune_string is not + explicitly specified. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/mvc17.c: New test. + +(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8) +--- + gcc/config/i386/i386-options.cc | 6 +++++- + gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++ + 2 files changed, 16 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 74e969b68..fb2ed942f 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args, + if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) + opts->x_ix86_tune_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); +- else if (orig_tune_defaulted) ++ /* If we have explicit arch string and no tune string specified, set ++ tune_string to NULL and later it will be overriden by arch_string ++ so target clones can get proper optimization. */ ++ else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH] ++ || orig_tune_defaulted) + opts->x_ix86_tune_string = NULL; + + /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ +diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c +new file mode 100644 +index 000000000..8b83c1aec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/mvc17.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++/* { dg-require-ifunc "" } */ ++/* { dg-options "-O2 -march=x86-64" } */ ++/* { dg-final { scan-assembler-times "rep mov" 1 } } */ ++ ++__attribute__((target_clones("default","arch=icelake-server"))) ++void ++foo (char *a, char *b, int size) ++{ ++ __builtin_memcpy (a, b, size & 0x7F); ++} +-- +2.28.0.windows.1 + diff --git a/0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch b/0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch new file mode 100644 index 0000000..972017b --- /dev/null +++ b/0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch @@ -0,0 +1,111 @@ +From fbcb1a5899b1bd3964aed78ed74041121e618d36 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 20 Jun 2023 15:41:00 +0800 +Subject: [PATCH 07/32] Refine maskloadmn pattern with UNSPEC_MASKLOAD. + +If mem_addr points to a memory region with less than whole vector size +bytes of accessible memory and k is a mask that would prevent reading +the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent +it to be transformed to vpblendd. + +gcc/ChangeLog: + + PR target/110309 + * config/i386/sse.md (maskload): + Refine pattern with UNSPEC_MASKLOAD. + (maskload): Ditto. + (*_load_mask): Extend mode iterator to + VI12HF_AVX512VL. + (*_load): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110309.c: New test. +--- + gcc/config/i386/sse.md | 32 +++++++++++++----------- + gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++ + 2 files changed, 28 insertions(+), 14 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index eb767e56c..b30e96cb1 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -1411,12 +1411,12 @@ + }) + + (define_insn "*_load_mask" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") +- (vec_merge:VI12_AVX512VL +- (unspec:VI12_AVX512VL +- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")] ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") ++ (vec_merge:VI12HF_AVX512VL ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")] + UNSPEC_MASKLOAD) +- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C") ++ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C") + (match_operand: 3 "register_operand" "Yk")))] + "TARGET_AVX512BW" + "vmovdqu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" +@@ -1425,9 +1425,9 @@ + (set_attr "mode" "")]) + + (define_insn_and_split "*_load" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") +- (unspec:VI12_AVX512VL +- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")] ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")] + UNSPEC_MASKLOAD))] + "TARGET_AVX512BW" + "#" +@@ -25973,17 +25973,21 @@ + "TARGET_AVX") + + (define_expand "maskload" +- [(set (match_operand:V48H_AVX512VL 0 "register_operand") +- (vec_merge:V48H_AVX512VL +- (match_operand:V48H_AVX512VL 1 "memory_operand") ++ [(set (match_operand:V48_AVX512VL 0 "register_operand") ++ (vec_merge:V48_AVX512VL ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "memory_operand")] ++ UNSPEC_MASKLOAD) + (match_dup 0) + (match_operand: 2 "register_operand")))] + "TARGET_AVX512F") + + (define_expand "maskload" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand") +- (vec_merge:VI12_AVX512VL +- (match_operand:VI12_AVX512VL 1 "memory_operand") ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand") ++ (vec_merge:VI12HF_AVX512VL ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand")] ++ UNSPEC_MASKLOAD) + (match_dup 0) + (match_operand: 2 "register_operand")))] + "TARGET_AVX512BW") +diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c +new file mode 100644 +index 000000000..f6e9e9c3c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110309.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */ ++/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */ ++ ++ ++void foo (int * __restrict a, int *b) ++{ ++ for (int i = 0; i < 6; ++i) ++ a[i] = b[i] + 42; ++} +-- +2.28.0.windows.1 + diff --git a/0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch b/0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch new file mode 100644 index 0000000..73544d2 --- /dev/null +++ b/0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch @@ -0,0 +1,126 @@ +From 5ad28ef4010c1248b4d94396d03f863705f7b0db Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 26 Jun 2023 21:07:09 +0800 +Subject: [PATCH 08/32] Refine maskstore patterns with UNSPEC_MASKMOV. + +Similar like r14-2070-gc79476da46728e + +If mem_addr points to a memory region with less than whole vector size +bytes of accessible memory and k is a mask that would prevent reading +the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent +it to be transformed to any other whole memory access instructions. + +gcc/ChangeLog: + + PR rtl-optimization/110237 + * config/i386/sse.md (_store_mask): Refine with + UNSPEC_MASKMOV. + (maskstore_store_mask): New define_insn, it's renamed + from original _store_mask. +--- + gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++-------- + 1 file changed, 57 insertions(+), 12 deletions(-) + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index b30e96cb1..3af159896 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -1554,7 +1554,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn "_store_mask" ++(define_insn "*_store_mask" + [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") + (vec_merge:V48_AVX512VL + (match_operand:V48_AVX512VL 1 "register_operand" "v") +@@ -1582,7 +1582,7 @@ + (set_attr "memory" "store") + (set_attr "mode" "")]) + +-(define_insn "_store_mask" ++(define_insn "*_store_mask" + [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "register_operand" "v") +@@ -26002,21 +26002,66 @@ + "TARGET_AVX") + + (define_expand "maskstore" +- [(set (match_operand:V48H_AVX512VL 0 "memory_operand") +- (vec_merge:V48H_AVX512VL +- (match_operand:V48H_AVX512VL 1 "register_operand") +- (match_dup 0) +- (match_operand: 2 "register_operand")))] ++ [(set (match_operand:V48_AVX512VL 0 "memory_operand") ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "register_operand") ++ (match_dup 0) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_MASKMOV))] + "TARGET_AVX512F") + + (define_expand "maskstore" +- [(set (match_operand:VI12_AVX512VL 0 "memory_operand") +- (vec_merge:VI12_AVX512VL +- (match_operand:VI12_AVX512VL 1 "register_operand") +- (match_dup 0) +- (match_operand: 2 "register_operand")))] ++ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "register_operand") ++ (match_dup 0) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_MASKMOV))] + "TARGET_AVX512BW") + ++(define_insn "_store_mask" ++ [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "register_operand" "v") ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")] ++ UNSPEC_MASKMOV))] ++ "TARGET_AVX512F" ++{ ++ if (FLOAT_MODE_P (GET_MODE_INNER (mode))) ++ { ++ if (misaligned_operand (operands[0], mode)) ++ return "vmovu\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ else ++ return "vmova\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ } ++ else ++ { ++ if (misaligned_operand (operands[0], mode)) ++ return "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ else ++ return "vmovdqa\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ } ++} ++ [(set_attr "type" "ssemov") ++ (set_attr "prefix" "evex") ++ (set_attr "memory" "store") ++ (set_attr "mode" "")]) ++ ++(define_insn "_store_mask" ++ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v") ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")] ++ UNSPEC_MASKMOV))] ++ "TARGET_AVX512BW" ++ "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "prefix" "evex") ++ (set_attr "memory" "store") ++ (set_attr "mode" "")]) ++ + (define_expand "cbranch4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:VI48_AVX 1 "register_operand") +-- +2.28.0.windows.1 + diff --git a/0064-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch b/0064-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch new file mode 100644 index 0000000..6760f16 --- /dev/null +++ b/0064-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch @@ -0,0 +1,38 @@ +From 50757adc93ef32a97a8a1083f5d53a9c00da6ac8 Mon Sep 17 00:00:00 2001 +From: "Cui, Lili" +Date: Thu, 29 Jun 2023 03:10:35 +0000 +Subject: [PATCH 09/32] x86: Update model values for Alderlake and Rocketlake. + +Update model values for Alderlake and Rocketlake according to SDM. + +gcc/ChangeLog + + * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8 + from Rocketlake, remove model value 0xbf from Alderlake. +--- + gcc/common/config/i386/cpuinfo.h | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h +index 0333da56b..28b2ff0b0 100644 +--- a/gcc/common/config/i386/cpuinfo.h ++++ b/gcc/common/config/i386/cpuinfo.h +@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model, + cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE; + break; + case 0xa7: +- case 0xa8: + /* Rocket Lake. */ + cpu = "rocketlake"; + CHECK___builtin_cpu_is ("corei7"); +@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model, + break; + case 0x97: + case 0x9a: +- case 0xbf: + /* Alder Lake. */ + cpu = "alderlake"; + CHECK___builtin_cpu_is ("corei7"); +-- +2.28.0.windows.1 + diff --git a/0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch b/0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch new file mode 100644 index 0000000..6c6f28f --- /dev/null +++ b/0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch @@ -0,0 +1,78 @@ +From 60364b439a80c217174e1830e0b7507d6f4538c4 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Fri, 4 Aug 2023 09:27:39 +0800 +Subject: [PATCH 10/32] Workaround possible CPUID bug in Sandy Bridge. + +Don't access leaf 7 subleaf 1 unless subleaf 0 says it is +supported via EAX. + +Intel documentation says invalid subleaves return 0. We had been +relying on that behavior instead of checking the max sublef number. + +It appears that some Sandy Bridge CPUs return at least the subleaf 0 +EDX value for subleaf 1. Best guess is that this is a bug in a +microcode patch since all of the bits we're seeing set in EDX were +introduced after Sandy Bridge was originally released. + +This is causing avxvnniint16 to be incorrectly enabled with +-march=native on these CPUs. + +gcc/ChangeLog: + + * common/config/i386/cpuinfo.h (get_available_features): Check + max_subleaf_level for valid subleaf before use CPUID. +--- + gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------ + 1 file changed, 17 insertions(+), 12 deletions(-) + +diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h +index 28b2ff0b0..316ad3cb3 100644 +--- a/gcc/common/config/i386/cpuinfo.h ++++ b/gcc/common/config/i386/cpuinfo.h +@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model, + /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */ + if (max_cpuid_level >= 7) + { +- __cpuid_count (7, 0, eax, ebx, ecx, edx); ++ unsigned int max_subleaf_level; ++ ++ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx); + if (ebx & bit_BMI) + set_feature (FEATURE_BMI); + if (ebx & bit_SGX) +@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model, + set_feature (FEATURE_AVX512FP16); + } + +- __cpuid_count (7, 1, eax, ebx, ecx, edx); +- if (eax & bit_HRESET) +- set_feature (FEATURE_HRESET); +- if (avx_usable) +- { +- if (eax & bit_AVXVNNI) +- set_feature (FEATURE_AVXVNNI); +- } +- if (avx512_usable) ++ if (max_subleaf_level >= 1) + { +- if (eax & bit_AVX512BF16) +- set_feature (FEATURE_AVX512BF16); ++ __cpuid_count (7, 1, eax, ebx, ecx, edx); ++ if (eax & bit_HRESET) ++ set_feature (FEATURE_HRESET); ++ if (avx_usable) ++ { ++ if (eax & bit_AVXVNNI) ++ set_feature (FEATURE_AVXVNNI); ++ } ++ if (avx512_usable) ++ { ++ if (eax & bit_AVX512BF16) ++ set_feature (FEATURE_AVX512BF16); ++ } + } + } + +-- +2.28.0.windows.1 +