!417 [Sync] Sync patch from openeuler/gcc

From: @wangding16 
Reviewed-by: @huang-xiaoquan 
Signed-off-by: @huang-xiaoquan
This commit is contained in:
openeuler-ci-bot 2024-04-24 11:35:33 +00:00 committed by Gitee
commit 47d51f9f9c
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
33 changed files with 5645 additions and 1 deletions

View File

@ -0,0 +1,27 @@
From fa6f80044dcebd28506e871e6e5d25e2dfd7e105 Mon Sep 17 00:00:00 2001
From: tiancheng-bao <baotiancheng1@huawei.com>
Date: Fri, 12 Apr 2024 15:09:28 +0800
Subject: [PATCH 01/32] Fix bug that verifying gimple failed when reorg-level >
5
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 3 +++
1 file changed, 3 insertions(+)
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index f03d1d875..e08577c0c 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7461,6 +7461,9 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi)
continue;
tree lhs_expr = newlhs[i] ? newlhs[i] : lhs;
tree rhs_expr = newrhs[i] ? newrhs[i] : rhs;
+ if (!useless_type_conversion_p (TREE_TYPE (lhs_expr),
+ TREE_TYPE (rhs_expr)))
+ rhs_expr = gimplify_build1 (gsi, NOP_EXPR, TREE_TYPE (lhs_expr), rhs_expr);
gimple *newstmt = gimple_build_assign (lhs_expr, rhs_expr);
if (dump_file && (dump_flags & TDF_DETAILS))
{
--
2.28.0.windows.1

View File

@ -0,0 +1,90 @@
From 13e82fccba781b29e55a6e1934986514019b728d Mon Sep 17 00:00:00 2001
From: zhenyu--zhao <zhaozhenyu17@huawei.com>
Date: Sun, 24 Mar 2024 20:42:27 +0800
Subject: [PATCH 02/32] [AutoFdo] Fix memory leaks in autofdo
---
gcc/final.cc | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/gcc/final.cc b/gcc/final.cc
index d4c4fa08f..af4e529bb 100644
--- a/gcc/final.cc
+++ b/gcc/final.cc
@@ -4402,12 +4402,15 @@ get_fdo_count_quality (profile_count count)
return profile_quality[count.quality ()];
}
-static const char *
+/* If the function is not public, return the function_name/file_name for
+ disambiguation of local symbols since there could be identical function
+ names coming from identical file names. The caller needs to free memory. */
+static char *
alias_local_functions (const char *fnname)
{
if (TREE_PUBLIC (cfun->decl))
{
- return fnname;
+ return concat (fnname, NULL);
}
return concat (fnname, "/", lbasename (dump_base_name), NULL);
}
@@ -4457,12 +4460,13 @@ dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count)
if (callee)
{
+ char *func_name =
+ alias_local_functions (get_fnname_from_decl (callee));
fprintf (asm_out_file, "\t.string \"%x\"\n",
INSN_ADDRESSES (INSN_UID (insn)));
fprintf (asm_out_file, "\t.string \"%s%s\"\n",
- ASM_FDO_CALLEE_FLAG,
- alias_local_functions (get_fnname_from_decl (callee)));
+ ASM_FDO_CALLEE_FLAG, func_name);
fprintf (asm_out_file,
"\t.string \"" HOST_WIDE_INT_PRINT_DEC "\"\n",
@@ -4472,9 +4476,9 @@ dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count)
{
fprintf (dump_file, "call: %x --> %s \n",
INSN_ADDRESSES (INSN_UID (insn)),
- alias_local_functions
- (get_fnname_from_decl (callee)));
+ func_name);
}
+ free (func_name);
}
}
}
@@ -4547,8 +4551,9 @@ dump_bb_info_to_asm (basic_block bb, gcov_type bb_count)
static void
dump_function_info_to_asm (const char *fnname)
{
+ char *func_name = alias_local_functions (fnname);
fprintf (asm_out_file, "\t.string \"%s%s\"\n",
- ASM_FDO_CALLER_FLAG, alias_local_functions (fnname));
+ ASM_FDO_CALLER_FLAG, func_name);
fprintf (asm_out_file, "\t.string \"%s%d\"\n",
ASM_FDO_CALLER_SIZE_FLAG, get_function_end_addr ());
fprintf (asm_out_file, "\t.string \"%s%s\"\n",
@@ -4557,7 +4562,7 @@ dump_function_info_to_asm (const char *fnname)
if (dump_file)
{
fprintf (dump_file, "\n FUNC_NAME: %s\n",
- alias_local_functions (fnname));
+ func_name);
fprintf (dump_file, " file: %s\n",
dump_base_name);
fprintf (dump_file, "profile_status: %s\n",
@@ -4567,6 +4572,7 @@ dump_function_info_to_asm (const char *fnname)
fprintf (dump_file, " function_bind: %s\n",
simple_get_function_bind ());
}
+ free (func_name);
}
/* Dump function profile into form AutoFDO or PGO to asm. */
--
2.28.0.windows.1

View File

@ -0,0 +1,135 @@
From 1649f9fbbc5267de2a675336d3ac665528a03db8 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 10 May 2023 15:16:58 +0800
Subject: [PATCH 03/32] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ
flags in MXCSR.
if (mdaz-ftz)
link crtfastmath.o
else if ((Ofast || ffast-math || funsafe-math-optimizations)
&& !mno-daz-ftz)
link crtfastmath.o
else
Don't link crtfastmath.o
gcc/ChangeLog:
* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
whenever -mdaz-ftz is specified. Don't link crtfastmath.o
when -mno-daz-ftz is specified.
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
* config/i386/gnu-user-common.h
(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
* config/i386/i386.opt (mdaz-ftz): New option.
* doc/invoke.texi (x86 options): Document -mdaz-ftz.
---
gcc/config/i386/cygwin.h | 2 +-
gcc/config/i386/darwin.h | 4 ++--
gcc/config/i386/gnu-user-common.h | 2 +-
gcc/config/i386/i386.opt | 4 ++++
gcc/config/i386/mingw32.h | 2 +-
gcc/doc/invoke.texi | 11 ++++++++++-
6 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index d06eda369..5412c5d44 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see
#undef ENDFILE_SPEC
#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
index a55f6b2b8..2f773924d 100644
--- a/gcc/config/i386/darwin.h
+++ b/gcc/config/i386/darwin.h
@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. If not see
"%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} "
#undef ENDFILE_SPEC
-#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+#define ENDFILE_SPEC \
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h
index 23b54c5be..3d2a33f17 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see
/* Similar to standard GNU userspace, but adding -ffast-math support. */
#define GNU_USER_TARGET_MATHFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index fc1b944ac..498fb454d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,6 +420,10 @@ mpc80
Target RejectNegative
Set 80387 floating-point precision to 80-bit.
+mdaz-ftz
+Target
+Set the FTZ and DAZ Flags.
+
mpreferred-stack-boundary=
Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
Attempt to keep stack aligned to this power of 2.
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index d3ca0cd02..ddbe6a405 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. If not see
#undef ENDFILE_SPEC
#define ENDFILE_SPEC \
- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2b376e0e9..3a48655e5 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options.
-m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol
-mregparm=@var{num} -msseregparm @gol
-mveclibabi=@var{type} -mvect8-ret-in-mem @gol
--mpc32 -mpc64 -mpc80 -mstackrealign @gol
+-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol
-momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
-mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol
-m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol
@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant
loss of accuracy, typically through so-called ``catastrophic cancellation'',
when this option is used to set the precision to less than extended precision.
+@item -mdaz-ftz
+@opindex mdaz-ftz
+
+The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register
+are used to control floating-point calculations.  SSE and AVX instructions
+including scalar and vector instructions could benefit from enabling the FTZ
+and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags
+when @option{-mno-daz-ftz} is specified.
+
@item -mstackrealign
@opindex mstackrealign
Realign the stack at entry. On the x86, the @option{-mstackrealign}
--
2.28.0.windows.1

View File

@ -0,0 +1,65 @@
From e70fa730dcfcb3a7b1d56a2e166752d4299f0504 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 5 Jun 2023 12:38:41 +0800
Subject: [PATCH 04/32] Explicitly view_convert_expr mask to signed type when
folding pblendvb builtins.
Since mask < 0 will be always false for vector char when
-funsigned-char, but vpblendvb needs to check the most significant
bit. The patch explicitly VCE to vector signed char.
gcc/ChangeLog:
PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
view_convert_expr mask to signed type when folding pblendvb
builtins.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110108-2.c: New test.
---
gcc/config/i386/i386.cc | 4 +++-
gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 462dce10e..479fc6010 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
? intSI_type_node : intDI_type_node;
type = get_same_sized_vectype (itype, type);
- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
}
+ else
+ type = signed_type_for (type);
+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
tree zero_vec = build_zero_cst (type);
tree cmp_type = truth_type_for (type);
tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c
new file mode 100644
index 000000000..2d1d2fd49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
+
+#include <immintrin.h>
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
+ __m128i Result = _mm_blendv_epi8(X0, X1, X2);
+ return Result;
+}
+
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
+ __m256i Result = _mm256_blendv_epi8(X0, X1, X2);
+ return Result;
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,138 @@
From 48715f03ad08f185153bfb0ff4c0802ab2d9579c Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 26 Jun 2023 09:50:25 +0800
Subject: [PATCH 05/32] Make option mvzeroupper independent of optimization
level.
pass_insert_vzeroupper is under condition
TARGET_AVX && TARGET_VZEROUPPER
&& flag_expensive_optimizations && !optimize_size
But the documentation of -mvzeroupper doesn't mention that the insertion
requires -O2 or above, which may confuse users when they explicitly
use -Os -mvzeroupper.
------------
mvzeroupper
Target Mask(VZEROUPPER) Save
Generate vzeroupper instruction before a transfer of control flow out of
the function.
------------
The patch moves flag_expensive_optimizations && !optimize_size to
ix86_option_override_internal. It makes -mvzeroupper independent of
optimization level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.
gcc/ChangeLog:
* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
Move flag_expensive_optimizations && !optimize_size to ..
* config/i386/i386-options.cc (ix86_option_override_internal):
.. this, it makes -mvzeroupper independent of optimization
level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
* gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase.
* gcc.target/i386/avx-vzeroupper-7.c: Ditto.
* gcc.target/i386/avx-vzeroupper-9.c: Ditto.
---
gcc/config/i386/i386-features.cc | 3 +--
gcc/config/i386/i386-options.cc | 4 +++-
gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++-
gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++
gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++-
gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++-
6 files changed, 24 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6fe41c3c2..6a2444eb6 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1875,8 +1875,7 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
- return TARGET_AVX && TARGET_VZEROUPPER
- && flag_expensive_optimizations && !optimize_size;
+ return TARGET_AVX && TARGET_VZEROUPPER;
}
virtual unsigned int execute (function *)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index ff44ad4e0..74e969b68 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p,
sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
- && TARGET_EMIT_VZEROUPPER)
+ && TARGET_EMIT_VZEROUPPER
+ && flag_expensive_optimizations
+ && !optimize_size)
opts->x_target_flags |= MASK_VZEROUPPER;
if (!(opts_set->x_target_flags & MASK_STV))
opts->x_target_flags |= MASK_STV;
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
index e694d4048..5a40e8783 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
@@ -16,5 +16,6 @@ foo ()
_mm256_zeroupper ();
}
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
new file mode 100644
index 000000000..4af637757
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
index ab6d68779..75fe58897 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
@@ -12,4 +12,5 @@ foo ()
_mm256_zeroupper ();
}
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
index 974e1626a..fa0a6dfca 100644
--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
@@ -15,4 +15,5 @@ foo ()
_mm256_zeroupper ();
}
-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
--
2.28.0.windows.1

View File

@ -0,0 +1,68 @@
From 8039d773354360ed8ff2f25c63843fc637eacc67 Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Sun, 25 Jun 2023 09:50:21 +0800
Subject: [PATCH 06/32] i386: Sync tune_string with arch_string for target
attribute
arch=*
For function with target attribute arch=*, current logic will set its
tune to -mtune from command line so all target_clones will get same
tuning flags which would affect the performance for each clone. Override
tune with arch if tune was not explicitly specified to get proper tuning
flags for target_clones.
gcc/ChangeLog:
* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
Override tune_string with arch_string if tune_string is not
explicitly specified.
gcc/testsuite/ChangeLog:
* gcc.target/i386/mvc17.c: New test.
(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8)
---
gcc/config/i386/i386-options.cc | 6 +++++-
gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 74e969b68..fb2ed942f 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args,
if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
opts->x_ix86_tune_string
= ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
- else if (orig_tune_defaulted)
+ /* If we have explicit arch string and no tune string specified, set
+ tune_string to NULL and later it will be overridden by arch_string
+ so target clones can get proper optimization. */
+ else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
+ || orig_tune_defaulted)
opts->x_ix86_tune_string = NULL;
/* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c
new file mode 100644
index 000000000..8b83c1aec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/mvc17.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-require-ifunc "" } */
+/* { dg-options "-O2 -march=x86-64" } */
+/* { dg-final { scan-assembler-times "rep mov" 1 } } */
+
+__attribute__((target_clones("default","arch=icelake-server")))
+void
+foo (char *a, char *b, int size)
+{
+ __builtin_memcpy (a, b, size & 0x7F);
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,111 @@
From fbcb1a5899b1bd3964aed78ed74041121e618d36 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 20 Jun 2023 15:41:00 +0800
Subject: [PATCH 07/32] Refine maskloadmn pattern with UNSPEC_MASKLOAD.
If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
it from being transformed to vpblendd.
gcc/ChangeLog:
PR target/110309
* config/i386/sse.md (maskload<mode><avx512fmaskmodelower>):
Refine pattern with UNSPEC_MASKLOAD.
(maskload<mode><avx512fmaskmodelower>): Ditto.
(*<avx512>_load<mode>_mask): Extend mode iterator to
VI12HF_AVX512VL.
(*<avx512>_load<mode>): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110309.c: New test.
---
gcc/config/i386/sse.md | 32 +++++++++++++-----------
gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++
2 files changed, 28 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index eb767e56c..b30e96cb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1411,12 +1411,12 @@
})
(define_insn "*<avx512>_load<mode>_mask"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
- (vec_merge:VI12_AVX512VL
- (unspec:VI12_AVX512VL
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI12HF_AVX512VL
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
UNSPEC_MASKLOAD)
- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
+ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C")
(match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
"TARGET_AVX512BW"
"vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
@@ -1425,9 +1425,9 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn_and_split "*<avx512>_load<mode>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
- (unspec:VI12_AVX512VL
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
UNSPEC_MASKLOAD))]
"TARGET_AVX512BW"
"#"
@@ -25973,17 +25973,21 @@
"TARGET_AVX")
(define_expand "maskload<mode><avx512fmaskmodelower>"
- [(set (match_operand:V48H_AVX512VL 0 "register_operand")
- (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "memory_operand")
+ [(set (match_operand:V48_AVX512VL 0 "register_operand")
+ (vec_merge:V48_AVX512VL
+ (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "memory_operand")]
+ UNSPEC_MASKLOAD)
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
"TARGET_AVX512F")
(define_expand "maskload<mode><avx512fmaskmodelower>"
- [(set (match_operand:VI12_AVX512VL 0 "register_operand")
- (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "memory_operand")
+ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand")
+ (vec_merge:VI12HF_AVX512VL
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "memory_operand")]
+ UNSPEC_MASKLOAD)
(match_dup 0)
(match_operand:<avx512fmaskmode> 2 "register_operand")))]
"TARGET_AVX512BW")
diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c
new file mode 100644
index 000000000..f6e9e9c3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110309.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */
+/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */
+
+
+void foo (int * __restrict a, int *b)
+{
+ for (int i = 0; i < 6; ++i)
+ a[i] = b[i] + 42;
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,126 @@
From 5ad28ef4010c1248b4d94396d03f863705f7b0db Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 26 Jun 2023 21:07:09 +0800
Subject: [PATCH 08/32] Refine maskstore patterns with UNSPEC_MASKMOV.
Similar to r14-2070-gc79476da46728e.
If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
it from being transformed to any other whole memory access instructions.
gcc/ChangeLog:
PR rtl-optimization/110237
* config/i386/sse.md (<avx512>_store<mode>_mask): Refine with
UNSPEC_MASKMOV.
(maskstore<mode><avx512fmaskmodelower>): Ditto.
(*<avx512>_store<mode>_mask): New define_insn, it's renamed
from original <avx512>_store<mode>_mask.
---
gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++--------
1 file changed, 57 insertions(+), 12 deletions(-)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b30e96cb1..3af159896 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1554,7 +1554,7 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "<avx512>_store<mode>_mask"
+(define_insn "*<avx512>_store<mode>_mask"
[(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
(vec_merge:V48_AVX512VL
(match_operand:V48_AVX512VL 1 "register_operand" "v")
@@ -1582,7 +1582,7 @@
(set_attr "memory" "store")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "<avx512>_store<mode>_mask"
+(define_insn "*<avx512>_store<mode>_mask"
[(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
(vec_merge:VI12HF_AVX512VL
(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
@@ -26002,21 +26002,66 @@
"TARGET_AVX")
(define_expand "maskstore<mode><avx512fmaskmodelower>"
- [(set (match_operand:V48H_AVX512VL 0 "memory_operand")
- (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+ [(set (match_operand:V48_AVX512VL 0 "memory_operand")
+ (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand")]
+ UNSPEC_MASKMOV))]
"TARGET_AVX512F")
(define_expand "maskstore<mode><avx512fmaskmodelower>"
- [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
- (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand")
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "register_operand")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand")]
+ UNSPEC_MASKMOV))]
"TARGET_AVX512BW")
+(define_insn "<avx512>_store<mode>_mask"
+ [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
+ (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand" "v")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
+ UNSPEC_MASKMOV))]
+ "TARGET_AVX512F"
+{
+ if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
+ {
+ if (misaligned_operand (operands[0], <MODE>mode))
+ return "vmovu<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ else
+ return "vmova<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ }
+ else
+ {
+ if (misaligned_operand (operands[0], <MODE>mode))
+ return "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ else
+ return "vmovdqa<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+ }
+}
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "memory" "store")
+ (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx512>_store<mode>_mask"
+ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
+ (unspec:VI12HF_AVX512VL
+ [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
+ (match_dup 0)
+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
+ UNSPEC_MASKMOV))]
+ "TARGET_AVX512BW"
+ "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix" "evex")
+ (set_attr "memory" "store")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "cbranch<mode>4"
[(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI48_AVX 1 "register_operand")
--
2.28.0.windows.1

View File

@ -0,0 +1,38 @@
From 50757adc93ef32a97a8a1083f5d53a9c00da6ac8 Mon Sep 17 00:00:00 2001
From: "Cui, Lili" <lili.cui@intel.com>
Date: Thu, 29 Jun 2023 03:10:35 +0000
Subject: [PATCH 09/32] x86: Update model values for Alderlake and Rocketlake.
Update model values for Alderlake and Rocketlake according to SDM.
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8
from Rocketlake, remove model value 0xbf from Alderlake.
---
gcc/common/config/i386/cpuinfo.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 0333da56b..28b2ff0b0 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE;
break;
case 0xa7:
- case 0xa8:
/* Rocket Lake. */
cpu = "rocketlake";
CHECK___builtin_cpu_is ("corei7");
@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
break;
case 0x97:
case 0x9a:
- case 0xbf:
/* Alder Lake. */
cpu = "alderlake";
CHECK___builtin_cpu_is ("corei7");
--
2.28.0.windows.1

View File

@ -0,0 +1,78 @@
From 60364b439a80c217174e1830e0b7507d6f4538c4 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 4 Aug 2023 09:27:39 +0800
Subject: [PATCH 10/32] Workaround possible CPUID bug in Sandy Bridge.
Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.
Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max subleaf number.
It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.
This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features): Check
max_subleaf_level for valid subleaf before using CPUID.
---
gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 28b2ff0b0..316ad3cb3 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model,
/* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
if (max_cpuid_level >= 7)
{
- __cpuid_count (7, 0, eax, ebx, ecx, edx);
+ unsigned int max_subleaf_level;
+
+ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx);
if (ebx & bit_BMI)
set_feature (FEATURE_BMI);
if (ebx & bit_SGX)
@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX512FP16);
}
- __cpuid_count (7, 1, eax, ebx, ecx, edx);
- if (eax & bit_HRESET)
- set_feature (FEATURE_HRESET);
- if (avx_usable)
- {
- if (eax & bit_AVXVNNI)
- set_feature (FEATURE_AVXVNNI);
- }
- if (avx512_usable)
+ if (max_subleaf_level >= 1)
{
- if (eax & bit_AVX512BF16)
- set_feature (FEATURE_AVX512BF16);
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
+ if (eax & bit_HRESET)
+ set_feature (FEATURE_HRESET);
+ if (avx_usable)
+ {
+ if (eax & bit_AVXVNNI)
+ set_feature (FEATURE_AVXVNNI);
+ }
+ if (avx512_usable)
+ {
+ if (eax & bit_AVX512BF16)
+ set_feature (FEATURE_AVX512BF16);
+ }
}
}
--
2.28.0.windows.1

View File

@ -0,0 +1,220 @@
From cfffbec938afdc45c31db5ec282ce21ad1ba2dc7 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 10 Aug 2023 11:41:39 +0800
Subject: [PATCH 11/32] Software mitigation: Disable gather generation in
vectorization for GDS affected Intel Processors.
For more details of GDS (Gather Data Sampling), refer to
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html
After the microcode update, there's a performance regression. To avoid that,
the patch disables gather generation in autovectorization but uses
gather scalar emulation instead.
gcc/ChangeLog:
* config/i386/i386-options.cc (m_GDS): New macro.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
enable for m_GDS.
(X86_TUNE_USE_GATHER_4PARTS): Ditto.
(X86_TUNE_USE_GATHER): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
gather vectorization.
* gcc.target/i386/avx2-gather-6.c: Ditto.
* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
* gcc.target/i386/pr88531-1b.c: Ditto.
* gcc.target/i386/pr88531-1c.c: Ditto.
(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d)
---
gcc/config/i386/i386-options.cc | 5 +++++
gcc/config/i386/x86-tune.def | 9 ++++++---
gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 2 +-
gcc/testsuite/gcc.target/i386/avx2-gather-6.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c | 2 +-
gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c | 2 +-
gcc/testsuite/gcc.target/i386/pr88531-1b.c | 2 +-
gcc/testsuite/gcc.target/i386/pr88531-1c.c | 2 +-
12 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index fb2ed942f..9617fc162 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3. If not see
#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
#define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
+/* Gather Data Sampling / CVE-2022-40982 / INTEL-SA-00828.
+ Software mitigation. */
+#define m_GDS (m_SKYLAKE | m_SKYLAKE_AVX512 | m_CANNONLAKE \
+ | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
+ | m_TIGERLAKE | m_COOPERLAKE | m_ROCKETLAKE)
#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
#define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index e6b9e2125..4392709fc 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -467,7 +467,8 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
+ | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements. */
@@ -477,7 +478,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
+ | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements. */
@@ -487,7 +489,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
+ | m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements. */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
index ad5ef7310..978924b0f 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */
+/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake -mtune=haswell" } */
#include "avx2-gather-1.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
index b9119581a..067b251e3 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=skylake" } */
+/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=haswell" } */
#include "avx2-gather-5.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
index 06d21bb01..d1a229861 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
index 462e951fd..d7b0b2b28 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
index 55a28dddb..07439185e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
index 969600885..3a9810827 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
index 6b0c8a859..ac669e048 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
index 3af568ab3..14a1083b6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
@@ -1,6 +1,6 @@
/* PR tree-optimization/88464 */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1b.c b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
index 812c8a10f..e6df789de 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-1b.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=skylake -mfpmath=sse" } */
+/* { dg-options "-O3 -march=skylake -mfpmath=sse -mtune=haswell" } */
#include "pr88531-1a.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1c.c b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
index 43fc5913e..a093c87c0 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-1c.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse" } */
+/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse -mtune=haswell" } */
#include "pr88531-1a.c"
--
2.28.0.windows.1

View File

@ -0,0 +1,187 @@
From c269629130cb23252da2db026ce9ed13f57f69f4 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 10 Aug 2023 16:26:13 +0800
Subject: [PATCH 12/32] Support -m[no-]gather -m[no-]scatter to enable/disable
vectorization for all gather/scatter instructions
Rename the original use_gather to use_gather_8parts. Support
-mtune-ctrl={,^}use_gather to set/clear the tune features
use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather
as an alias of -mtune-ctrl=use_gather (and -mno-gather of ^use_gather).
Similarly for use_scatter.
gcc/ChangeLog:
* config/i386/i386-builtins.cc
(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
* config/i386/i386-options.cc (parse_mtune_ctrl_str):
Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
8parts} for -mtune-ctrl={,^}{use_gather,use_scatter}.
* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
for use_scatter_8parts.
* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
(TARGET_USE_GATHER_8PARTS): .. this.
(TARGET_USE_SCATTER): Rename to ..
(TARGET_USE_SCATTER_8PARTS): .. this.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
(X86_TUNE_USE_GATHER_8PARTS): .. this.
(X86_TUNE_USE_SCATTER): Rename to
(X86_TUNE_USE_SCATTER_8PARTS): .. this.
* config/i386/i386.opt: Add new options mgather, mscatter.
(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737)
---
gcc/config/i386/i386-builtins.cc | 2 +-
gcc/config/i386/i386-options.cc | 54 +++++++++++++++++++++++---------
gcc/config/i386/i386.cc | 2 +-
gcc/config/i386/i386.h | 8 ++---
gcc/config/i386/i386.opt | 4 +++
gcc/config/i386/x86-tune.def | 4 +--
6 files changed, 52 insertions(+), 22 deletions(-)
diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index 050c6228a..8ed32e14f 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
? !TARGET_USE_GATHER_2PARTS
: (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
? !TARGET_USE_GATHER_4PARTS
- : !TARGET_USE_GATHER)))
+ : !TARGET_USE_GATHER_8PARTS)))
return NULL_TREE;
if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9617fc162..3df1f0c41 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump)
curr_feature_string++;
clear = true;
}
- for (i = 0; i < X86_TUNE_LAST; i++)
- {
- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
- {
- ix86_tune_features[i] = !clear;
- if (dump)
- fprintf (stderr, "Explicitly %s feature %s\n",
- clear ? "clear" : "set", ix86_tune_feature_names[i]);
- break;
- }
- }
- if (i == X86_TUNE_LAST)
- error ("unknown parameter to option %<-mtune-ctrl%>: %s",
- clear ? curr_feature_string - 1 : curr_feature_string);
+
+ if (!strcmp (curr_feature_string, "use_gather"))
+ {
+ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
+ if (dump)
+ fprintf (stderr, "Explicitly %s features use_gather_2parts,"
+ " use_gather_4parts, use_gather_8parts\n",
+ clear ? "clear" : "set");
+
+ }
+ else if (!strcmp (curr_feature_string, "use_scatter"))
+ {
+ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
+ if (dump)
+ fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
+ " use_scatter_4parts, use_scatter_8parts\n",
+ clear ? "clear" : "set");
+ }
+ else
+ {
+ for (i = 0; i < X86_TUNE_LAST; i++)
+ {
+ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+ {
+ ix86_tune_features[i] = !clear;
+ if (dump)
+ fprintf (stderr, "Explicitly %s feature %s\n",
+ clear ? "clear" : "set", ix86_tune_feature_names[i]);
+ break;
+ }
+ }
+
+ if (i == X86_TUNE_LAST)
+ error ("unknown parameter to option %<-mtune-ctrl%>: %s",
+ clear ? curr_feature_string - 1 : curr_feature_string);
+ }
curr_feature_string = next_feature_string;
}
while (curr_feature_string);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 479fc6010..e75d37023 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
? !TARGET_USE_SCATTER_2PARTS
: (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
? !TARGET_USE_SCATTER_4PARTS
- : !TARGET_USE_SCATTER))
+ : !TARGET_USE_SCATTER_8PARTS))
return NULL_TREE;
if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 688aaabd3..aaa136ba0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
#define TARGET_USE_SCATTER_4PARTS \
ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
-#define TARGET_USE_GATHER \
- ix86_tune_features[X86_TUNE_USE_GATHER]
-#define TARGET_USE_SCATTER \
- ix86_tune_features[X86_TUNE_USE_SCATTER]
+#define TARGET_USE_GATHER_8PARTS \
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS]
+#define TARGET_USE_SCATTER_8PARTS \
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS]
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 498fb454d..b154110d8 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated.
munroll-only-small-loops
Target Var(ix86_unroll_only_small_loops) Init(0) Save
Enable conservative small loop unrolling.
+
+mscatter
+Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
+Enable vectorization for scatter instruction.
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 4392709fc..bdb455d20 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
+DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
| m_GENERIC | m_GDS))
/* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements. */
-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
+DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
~(m_ZNVER4))
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
--
2.28.0.windows.1

View File

@ -0,0 +1,129 @@
From 764518a35e90a3e13c469275da9c3c7002fe1982 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 8 Sep 2023 09:22:43 +0800
Subject: [PATCH 13/32] Remove constraint modifier % for
fcmaddcph/fmaddcph/fcmulcph since they're not commutative.
gcc/ChangeLog:
PR target/111306
PR target/111335
* config/i386/sse.md (int_comm): New int_attr.
(fma_<complexopname>_<mode><sdc_maskz_name><round_name>):
Remove % for Complex conjugate operations since they're not
commutative.
(fma_<complexpairopname>_<mode>_pair): Ditto.
(<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto.
(cmul<conj_op><mode>3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr111306.c: New test.
(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb)
---
gcc/config/i386/sse.md | 16 ++++++++---
gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++
2 files changed, 48 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3af159896..f25dd5f2b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6318,6 +6318,14 @@
[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
(UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+(define_int_attr int_comm
+ [(UNSPEC_COMPLEX_FMA "")
+ (UNSPEC_COMPLEX_FMA_PAIR "")
+ (UNSPEC_COMPLEX_FCMA "")
+ (UNSPEC_COMPLEX_FCMA_PAIR "")
+ (UNSPEC_COMPLEX_FMUL "%")
+ (UNSPEC_COMPLEX_FCMUL "")])
+
(define_int_attr conj_op
[(UNSPEC_COMPLEX_FMA "")
(UNSPEC_COMPLEX_FCMA "_conj")
@@ -6431,7 +6439,7 @@
(define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>"
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v")
+ [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "<int_comm>v")
(match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>")
(match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")]
UNSPEC_COMPLEX_F_C_MA))]
@@ -6495,7 +6503,7 @@
(define_insn "fma_<complexpairopname>_<mode>_pair"
[(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
(unspec:VF1_AVX512VL
- [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+ [(match_operand:VF1_AVX512VL 1 "vector_operand" "<int_comm>v")
(match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
(match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
UNSPEC_COMPLEX_F_C_MA_PAIR))]
@@ -6562,7 +6570,7 @@
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(vec_merge:VF_AVX512FP16VL
(unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")
(match_operand:VF_AVX512FP16VL 3 "register_operand" "0")]
UNSPEC_COMPLEX_F_C_MA)
@@ -6586,7 +6594,7 @@
(define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
[(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
+ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
(match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
UNSPEC_COMPLEX_F_C_MUL))]
"TARGET_AVX512FP16 && <round_mode512bit_condition>"
diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c
new file mode 100644
index 000000000..541725ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111306.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+__attribute__((optimize("O2"),noipa))
+void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
+ __m512h rA = _mm512_loadu_ph(a);
+ for (int i = 0; i < n; i += 32) {
+ __m512h rB = _mm512_loadu_ph(b + i);
+ _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
+ }
+}
+
+void
+test_512 (void)
+{
+ int n = 32;
+ _Float16 a[n], b[n], c[n];
+ _Float16 exp[n];
+ for (int i = 1; i <= n; i++) {
+ a[i - 1] = i & 1 ? -i : i;
+ b[i - 1] = i;
+ }
+
+ func1(a, b, n, c);
+ for (int i = 0; i < n / 32; i += 2) {
+ if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
+ || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
+ __builtin_abort ();
+ }
+}
+
+
--
2.28.0.windows.1

View File

@ -0,0 +1,106 @@
From afd539adfe762adb57863299a11987b7e20e7987 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 5 Jul 2023 13:45:11 +0800
Subject: [PATCH 14/32] Slightly disparage the alternatives which move
DFmode between SSE_REGS and GENERAL_REGS.
For testcase
void __cond_swap(double* __x, double* __y) {
bool __r = (*__x < *__y);
auto __tmp = __r ? *__x : *__y;
*__y = __r ? *__y : *__x;
*__x = __tmp;
}
GCC-14 with -O2 and -march=x86-64 options generates the following code:
__cond_swap(double*, double*):
movsd xmm1, QWORD PTR [rdi]
movsd xmm0, QWORD PTR [rsi]
comisd xmm0, xmm1
jbe .L2
movq rax, xmm1
movapd xmm1, xmm0
movq xmm0, rax
.L2:
movsd QWORD PTR [rsi], xmm1
movsd QWORD PTR [rdi], xmm0
ret
rax is used to save and restore the DFmode value. In RA, both GENERAL_REGS
and SSE_REGS cost zero since we didn't disparage the
alternative in the movdf_internal pattern, so according to the register
allocation order, GENERAL_REGS is allocated. The patch adds ? for
alternatives (r,v) and (v,r), just like we did for the movsf/hf/bf_internal
patterns; after that we get optimal RA.
__cond_swap:
.LFB0:
.cfi_startproc
movsd (%rdi), %xmm1
movsd (%rsi), %xmm0
comisd %xmm1, %xmm0
jbe .L2
movapd %xmm1, %xmm2
movapd %xmm0, %xmm1
movapd %xmm2, %xmm0
.L2:
movsd %xmm1, (%rsi)
movsd %xmm0, (%rdi)
ret
gcc/ChangeLog:
PR target/110170
* config/i386/i386.md (movdf_internal): Disparage slightly for
2 alternatives (r,v) and (v,r) by adding constraint modifier
'?'.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110170-3.c: New test.
(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e)
---
gcc/config/i386/i386.md | 4 ++--
gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++
2 files changed, 13 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index be07be10d..71691f598 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3582,9 +3582,9 @@
;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
(define_insn "*movdf_internal"
[(set (match_operand:DF 0 "nonimmediate_operand"
- "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m")
+ "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r ,o ,r ,m")
(match_operand:DF 1 "general_operand"
- "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
+ "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))]
"!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (lra_in_progress || reload_completed
|| !CONST_DOUBLE_P (operands[1])
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c
new file mode 100644
index 000000000..70daa89e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
+
+void __cond_swap(double* __x, double* __y) {
+ _Bool __r = (*__x < *__y);
+ double __tmp = __r ? *__x : *__y;
+ *__y = __r ? *__y : *__x;
+ *__x = __tmp;
+}
+
--
2.28.0.windows.1

View File

@ -0,0 +1,163 @@
From 88516507757932c1e67ce99d240596935971d2d0 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 9 Nov 2023 13:20:05 +0800
Subject: [PATCH 15/32] Fix wrong code due to vec_merge + pcmp to blendvb
splitter.
gcc/ChangeLog:
PR target/112443
* config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition
from LT to GT since there's a NOT in the pattern.
(*avx2_pcmp<mode>3_5): Ditto.
gcc/testsuite/ChangeLog:
* g++.target/i386/pr112443.C: New test.
(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd)
---
gcc/config/i386/sse.md | 4 +-
gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++
2 files changed, 110 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f25dd5f2b..23b858ab2 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16358,7 +16358,7 @@
(match_dup 4))]
UNSPEC_BLENDV))]
{
- if (INTVAL (operands[5]) == 1)
+ if (INTVAL (operands[5]) == 5)
std::swap (operands[1], operands[2]);
operands[3] = gen_lowpart (<MODE>mode, operands[3]);
})
@@ -16388,7 +16388,7 @@
(match_dup 4))]
UNSPEC_BLENDV))]
{
- if (INTVAL (operands[5]) == 1)
+ if (INTVAL (operands[5]) == 5)
std::swap (operands[1], operands[2]);
})
diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C
new file mode 100644
index 000000000..ebfa9b4a7
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr112443.C
@@ -0,0 +1,108 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */
+
+#include <cstdint>
+#include <x86intrin.h>
+#include <functional>
+#include <ostream>
+
+#define AVX512BW
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+struct TensorIteratorBase{
+ char* in;
+ char* out;
+
+ void for_each(std::function<void(char*, char*, int64_t size)> loop){
+ loop(out, in, 32);
+ }
+};
+
+class Vectorized {
+protected:
+ __m256i values;
+
+ static inline __m256i invert(const __m256i& v) {
+ const auto ones = _mm256_set1_epi64x(-1);
+ return _mm256_xor_si256(ones, v);
+ }
+public:
+ operator __m256i() const {
+ return values;
+ }
+
+ static constexpr int size() {
+ return 32;
+ }
+
+ Vectorized() {}
+ Vectorized(__m256i v) : values(v) {}
+ Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
+ static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+ const Vectorized& mask) {
+ return _mm256_blendv_epi8(a, b, mask);
+ }
+ static Vectorized loadu(const void* ptr) {
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+ }
+ void store(void* ptr) const {
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+ }
+
+ Vectorized operator<(const Vectorized& other) const {
+ __m256i max = _mm256_max_epu8(values, other);
+ return invert(_mm256_cmpeq_epi8(max, values));
+ }
+ Vectorized operator-(const Vectorized& b) {
+ return _mm256_sub_epi8(values, b);
+ }
+};
+
+std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) {
+ uint8_t buf[Vectorized::size()];
+ vec.store(buf);
+ stream << "vec[";
+ for (int i = 0; i != Vectorized::size(); i++) {
+ if (i != 0)
+ stream << ", ";
+ stream << buf[i]*1;
+ }
+ stream << "]";
+ return stream;
+}
+
+void run(TensorIteratorBase iter){
+ Vectorized zero_vec(0);
+ Vectorized one_vec(1);
+
+ iter.for_each([=](char* out, char* in, int64_t size) {
+ for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) {
+ auto self_vec = Vectorized::loadu(in + i);
+ auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec);
+ auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec);
+ auto outv = left - right;
+ outv.store(out + i);
+ }
+ });
+}
+
+void
+test_256 (){
+ char in[32];
+ char out[32];
+ for(auto& x: in) x = 1;
+ run(TensorIteratorBase{in, out});
+ Vectorized::loadu (out);
+ for (int i = 0; i != 32; i++)
+ if (out[i] != 1)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,151 @@
From 204ffa7f503411ccac0161c951726274648b6374 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 7 Dec 2023 09:17:27 +0800
Subject: [PATCH 16/32] Don't assume it's AVX_U128_CLEAN after call_insn whose
abi.mode_clobber(V4DImode) doesn't contain all SSE_REGS.
If the function doesn't clobber any SSE registers or only clobbers the
128-bit part, then vzeroupper isn't issued before the function exit, so
the status after the function is not CLEAN but ANY.
Also, for a sibling call it's safe to issue a vzeroupper; otherwise a
vzeroupper could be missing, since there's no mode_exit for
sibling_call_p.
gcc/ChangeLog:
PR target/112891
* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
align with ix86_avx_u128_mode_needed.
(ix86_avx_u128_mode_needed): Return AVX_U128_CLEAN for
sibling_call.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr112891.c: New test.
* gcc.target/i386/pr112891-2.c: New test.
(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642)
---
gcc/config/i386/i386.cc | 22 +++++++++++++---
gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr112891.c | 29 +++++++++++++++++++++
3 files changed, 78 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e75d37023..60f3296b0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
modes wider than 256 bits. It's only safe to issue a
vzeroupper if all SSE registers are clobbered. */
const function_abi &abi = insn_callee_abi (insn);
- if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
- abi.mode_clobbers (V4DImode)))
+ /* Should be safe to issue a vzeroupper before sibling_call_p.
+ Also there is no mode_exit for sibling_call, so there could be
+ a missing vzeroupper for that. */
+ if (!(SIBLING_CALL_P (insn)
+ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+ abi.mode_clobbers (V4DImode))))
return AVX_U128_ANY;
return AVX_U128_CLEAN;
@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
bool avx_upper_reg_found = false;
note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+ if (avx_upper_reg_found)
+ return AVX_U128_DIRTY;
+
+ /* If the function desn't clobber any sse registers or only clobber
+ 128-bit part, Then vzeroupper isn't issued before the function exit.
+ the status not CLEAN but ANY after the function. */
+ const function_abi &abi = insn_callee_abi (insn);
+ if (!(SIBLING_CALL_P (insn)
+ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+ abi.mode_clobbers (V4DImode))))
+ return AVX_U128_ANY;
+
+ return AVX_U128_CLEAN;
}
/* Otherwise, return current mode. Remember that if insn
diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
new file mode 100644
index 000000000..164c3985d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+ a[0] = 1.0;
+ a[1] = 2.0;
+}
+
+double
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ bar (b);
+ return a[5] + b[5];
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+ double c = foo (a, b);
+ return __builtin_exp (c);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
new file mode 100644
index 000000000..dbf6c6794
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112891.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O3" } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
+
+void
+__attribute__((noinline))
+bar (double* a)
+{
+ a[0] = 1.0;
+ a[1] = 2.0;
+}
+
+void
+__attribute__((noinline))
+foo (double* __restrict a, double* b)
+{
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ bar (b);
+}
+
+double
+foo1 (double* __restrict a, double* b)
+{
+ foo (a, b);
+ return __builtin_exp (b[1]);
+}
--
2.28.0.windows.1

View File

@ -0,0 +1,142 @@
From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001
From: Jan Hubicka <jh@suse.cz>
Date: Fri, 29 Dec 2023 23:51:03 +0100
Subject: [PATCH 17/32] Disable FMADD in chains for Zen4 and generic
This patch disables use of FMA in matrix multiplication loop for generic (for
x86-64-v3) and zen4. I tested this on zen4 and Xeon Gold 6212U.
For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.
On core the micro-benchmark runs as follows:
With FMA:
578,500,241 cycles:u # 3.645 GHz
( +- 0.12% )
753,318,477 instructions:u # 1.30 insn per
cycle ( +- 0.00% )
125,417,701 branches:u # 790.227 M/sec
( +- 0.00% )
0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% )
No FMA:
577,573,960 cycles:u # 3.514 GHz
( +- 0.15% )
878,318,479 instructions:u # 1.52 insn per
cycle ( +- 0.00% )
125,417,702 branches:u # 763.035 M/sec
( +- 0.00% )
0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% )
So the cycle count is unchanged and discrete multiply+add takes same time as
FMA.
While on zen:
With FMA:
484875179 cycles:u # 3.599 GHz
( +- 0.05% ) (82.11%)
752031517 instructions:u # 1.55 insn per
cycle
125106525 branches:u # 928.712 M/sec
( +- 0.03% ) (85.09%)
128356 branch-misses:u # 0.10% of all
branches ( +- 0.06% ) (83.58%)
No FMA:
375875209 cycles:u # 3.592 GHz
( +- 0.08% ) (80.74%)
875725341 instructions:u # 2.33 insn per
cycle
124903825 branches:u # 1.194 G/sec
( +- 0.04% ) (84.59%)
0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% )
The difference is that Cores understand the fact that fmadd does not need
all three parameters to start computation, while Zen cores don't.
Since this seems a noticeable win on zen and no loss on Core, it seems like a
good default for generic.
float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];
void init(void)
{
int i, j, k;
for(i=0; i<SIZE; ++i)
{
for(j=0; j<SIZE; ++j)
{
a[i][j] = (float)i + j;
b[i][j] = (float)i - j;
c[i][j] = 0.0f;
}
}
}
void mult(void)
{
int i, j, k;
for(i=0; i<SIZE; ++i)
{
for(j=0; j<SIZE; ++j)
{
for(k=0; k<SIZE; ++k)
{
c[i][j] += a[i][k] * b[k][j];
}
}
}
}
int main(void)
{
clock_t s, e;
init();
s=clock();
mult();
e=clock();
printf(" mult took %10d clocks\n", (int)(e-s));
return 0;
}
gcc/ChangeLog:
* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and generic.
---
gcc/config/i386/x86-tune.def | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index bdb455d20..fd095f3ec 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain. */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
+ | m_ZNVER3 | m_ZNVER4 | m_GENERIC)
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain. */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
- | m_ALDERLAKE | m_SAPPHIRERAPIDS)
+ | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC)
/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain. */
--
2.28.0.windows.1

View File

@ -0,0 +1,47 @@
From 411d1f0bcc0d1c8018fdf5fe84ad2404929556ec Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Fri, 16 Sep 2022 13:59:01 +0800
Subject: [PATCH 18/32] Initial Raptorlake Support
gcc/ChangeLog:
* common/config/i386/cpuinfo.h:
(get_intel_cpu): Handle Raptorlake.
* common/config/i386/i386-common.cc:
(processor_alias_table): Add Raptorlake.
(cherry picked from commit 470a0659b508d684148f362c4dc0eccf5a83a23e)
---
gcc/common/config/i386/cpuinfo.h | 2 ++
gcc/common/config/i386/i386-common.cc | 2 ++
2 files changed, 4 insertions(+)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 316ad3cb3..13d0f4cd8 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -508,6 +508,8 @@ get_intel_cpu (struct __processor_model *cpu_model,
case 0x97:
case 0x9a:
/* Alder Lake. */
+ case 0xb7:
+ /* Raptor Lake. */
cpu = "alderlake";
CHECK___builtin_cpu_is ("corei7");
CHECK___builtin_cpu_is ("alderlake");
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index f650e255f..c1d700f89 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1939,6 +1939,8 @@ const pta processor_alias_table[] =
M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F},
{"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
+ {"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
+ M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
{"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
{"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
--
2.28.0.windows.1

View File

@ -0,0 +1,49 @@
From 87cea29ede520f4a5af01dff7071ab1d23bd47b5 Mon Sep 17 00:00:00 2001
From: "Hu, Lin1" <lin1.hu@intel.com>
Date: Fri, 16 Sep 2022 11:25:13 +0800
Subject: [PATCH 19/32] Initial Meteorlake Support
gcc/ChangeLog:
* common/config/i386/cpuinfo.h:
(get_intel_cpu): Handle Meteorlake.
* common/config/i386/i386-common.cc:
(processor_alias_table): Add Meteorlake.
(cherry picked from commit fd206f0e95fb6f41b96eaaaab1dc0c30378e5e08)
---
gcc/common/config/i386/cpuinfo.h | 4 ++++
gcc/common/config/i386/i386-common.cc | 2 ++
2 files changed, 6 insertions(+)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 13d0f4cd8..37af92d6b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -510,6 +510,10 @@ get_intel_cpu (struct __processor_model *cpu_model,
/* Alder Lake. */
case 0xb7:
/* Raptor Lake. */
+ case 0xb5:
+ case 0xaa:
+ case 0xac:
+ /* Meteor Lake. */
cpu = "alderlake";
CHECK___builtin_cpu_is ("corei7");
CHECK___builtin_cpu_is ("alderlake");
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index c1d700f89..cfee672fb 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1941,6 +1941,8 @@ const pta processor_alias_table[] =
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
{"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
+ {"meteorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
+ M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
{"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
{"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
--
2.28.0.windows.1

View File

@ -0,0 +1,691 @@
From c11301c7780213ddf46a0bcdb06079af485f431c Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Fri, 4 Nov 2022 15:50:55 +0800
Subject: [PATCH 20/32] Support Intel AMX-FP16 ISA
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features): Detect
amx-fp16.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_FP16_SET,
OPTION_MASK_ISA2_AMX_FP16_UNSET): New macros.
(ix86_handle_option): Handle -mamx-fp16.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AMX_FP16.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
amx-fp16.
* config.gcc: Add amxfp16intrin.h.
* config/i386/cpuid.h (bit_AMX_FP16): New.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AMX_FP16__.
* config/i386/i386-isa.def: Add DEF_PTA for AMX_FP16.
* config/i386/i386-options.cc (isa2_opts): Add -mamx-fp16.
(ix86_valid_target_attribute_inner_p): Add new ATTR.
(ix86_option_override_internal): Handle AMX-FP16.
* config/i386/i386.opt: Add -mamx-fp16.
* config/i386/immintrin.h: Include amxfp16intrin.h.
* doc/extend.texi: Document -mamx-fp16.
* doc/invoke.texi: Document amx-fp16.
* doc/sourcebuild.texi: Document amx_fp16.
* config/i386/amxfp16intrin.h: New file.
gcc/testsuite/ChangeLog:
* g++.dg/other/i386-2.C: Add -mamx-fp16.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/sse-12.c: Ditto.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Ditto.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp: (check_effective_target_amx_fp16):
New proc.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/amx-check.h: Add AMX_FP16.
* gcc.target/i386/amx-helper.h: New file to support amx-fp16.
* gcc.target/i386/amxfp16-asmatt-1.c: New test.
* gcc.target/i386/amxfp16-asmintel-1.c: Ditto.
* gcc.target/i386/amxfp16-dpfp16ps-2.c: Ditto.
Co-authored-by: Haochen Jiang <haochen.jiang@intel.com>
(cherry picked from commit 2b4a03962a0fe18cadc944d90f1fb85a40004226)
---
gcc/common/config/i386/cpuinfo.h | 5 ++
gcc/common/config/i386/i386-common.cc | 15 +++++
gcc/common/config/i386/i386-cpuinfo.h | 1 +
gcc/common/config/i386/i386-isas.h | 1 +
gcc/config.gcc | 3 +-
gcc/config/i386/amxfp16intrin.h | 46 ++++++++++++++
gcc/config/i386/cpuid.h | 1 +
gcc/config/i386/i386-c.cc | 2 +
gcc/config/i386/i386-isa.def | 1 +
gcc/config/i386/i386-options.cc | 4 +-
gcc/config/i386/i386.opt | 4 ++
gcc/config/i386/immintrin.h | 2 +
gcc/doc/extend.texi | 5 ++
gcc/doc/invoke.texi | 9 ++-
gcc/doc/sourcebuild.texi | 3 +
gcc/testsuite/g++.dg/other/i386-2.C | 2 +-
gcc/testsuite/g++.dg/other/i386-3.C | 2 +-
gcc/testsuite/gcc.target/i386/amx-check.h | 3 +
gcc/testsuite/gcc.target/i386/amx-helper.h | 61 +++++++++++++++++++
.../gcc.target/i386/amxfp16-asmatt-1.c | 13 ++++
.../gcc.target/i386/amxfp16-asmintel-1.c | 10 +++
.../gcc.target/i386/amxfp16-dpfp16ps-2.c | 57 +++++++++++++++++
gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 +
gcc/testsuite/gcc.target/i386/sse-12.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-13.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-14.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-22.c | 4 +-
gcc/testsuite/gcc.target/i386/sse-23.c | 2 +-
gcc/testsuite/lib/target-supports.exp | 11 ++++
29 files changed, 262 insertions(+), 13 deletions(-)
create mode 100644 gcc/config/i386/amxfp16intrin.h
create mode 100644 gcc/testsuite/gcc.target/i386/amx-helper.h
create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 37af92d6b..5951a30aa 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -783,6 +783,11 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX512BF16);
}
}
+ if (amx_usable)
+ {
+ if (eax & bit_AMX_FP16)
+ set_feature (FEATURE_AMX_FP16);
+ }
}
/* Get Advanced Features at level 0xd (eax = 0xd, ecx = 1). */
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index cfee672fb..922db33ee 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -107,6 +107,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_AMX_TILE_SET OPTION_MASK_ISA2_AMX_TILE
#define OPTION_MASK_ISA2_AMX_INT8_SET OPTION_MASK_ISA2_AMX_INT8
#define OPTION_MASK_ISA2_AMX_BF16_SET OPTION_MASK_ISA2_AMX_BF16
+#define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
@@ -275,6 +276,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_KL_UNSET \
(OPTION_MASK_ISA2_KL | OPTION_MASK_ISA2_WIDEKL_UNSET)
#define OPTION_MASK_ISA2_WIDEKL_UNSET OPTION_MASK_ISA2_WIDEKL
+#define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
@@ -1125,6 +1127,19 @@ ix86_handle_option (struct gcc_options *opts,
}
return true;
+ case OPT_mamx_fp16:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_FP16_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP16_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
index 82996ebb3..8f22897de 100644
--- a/gcc/common/config/i386/i386-cpuinfo.h
+++ b/gcc/common/config/i386/i386-cpuinfo.h
@@ -240,6 +240,7 @@ enum processor_features
FEATURE_X86_64_V2,
FEATURE_X86_64_V3,
FEATURE_X86_64_V4,
+ FEATURE_AMX_FP16,
CPU_FEATURE_MAX
};
diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h
index 2d0646a68..95bab6da2 100644
--- a/gcc/common/config/i386/i386-isas.h
+++ b/gcc/common/config/i386/i386-isas.h
@@ -175,4 +175,5 @@ ISA_NAMES_TABLE_START
ISA_NAMES_TABLE_ENTRY("x86-64-v2", FEATURE_X86_64_V2, P_X86_64_V2, NULL)
ISA_NAMES_TABLE_ENTRY("x86-64-v3", FEATURE_X86_64_V3, P_X86_64_V3, NULL)
ISA_NAMES_TABLE_ENTRY("x86-64-v4", FEATURE_X86_64_V4, P_X86_64_V4, NULL)
+ ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16")
ISA_NAMES_TABLE_END
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 4a0ae9328..e2b4a23dc 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -423,7 +423,8 @@ i[34567]86-*-* | x86_64-*-*)
tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
amxbf16intrin.h x86gprintrin.h uintrintrin.h
hresetintrin.h keylockerintrin.h avxvnniintrin.h
- mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h"
+ mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
+ amxfp16intrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/amxfp16intrin.h b/gcc/config/i386/amxfp16intrin.h
new file mode 100644
index 000000000..6a114741a
--- /dev/null
+++ b/gcc/config/i386/amxfp16intrin.h
@@ -0,0 +1,46 @@
+/* Copyright (C) 2020 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxfp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXFP16INTRIN_H_INCLUDED
+#define _AMXFP16INTRIN_H_INCLUDED
+
+#if defined(__x86_64__)
+#define _tile_dpfp16ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpfp16ps(dst,src1,src2) \
+ _tile_dpfp16ps_internal (dst,src1,src2)
+
+#endif
+
+#ifdef __DISABLE_AMX_FP16__
+#undef __DISABLE_AMX_FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_FP16__ */
+
+#endif /* _AMXFP16INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 8b3dc2b1d..d6cd8d1bf 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -27,6 +27,7 @@
/* %eax */
#define bit_AVXVNNI (1 << 4)
#define bit_AVX512BF16 (1 << 5)
+#define bit_AMX_FP16 (1 << 21)
#define bit_HRESET (1 << 22)
/* %ecx */
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 3fec4c7e2..4269f29e6 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -633,6 +633,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__WIDEKL__");
if (isa_flag2 & OPTION_MASK_ISA2_AVXVNNI)
def_or_undef (parse_in, "__AVXVNNI__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP16)
+ def_or_undef (parse_in, "__AMX_FP16__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def
index 83659d0be..c7305c01b 100644
--- a/gcc/config/i386/i386-isa.def
+++ b/gcc/config/i386/i386-isa.def
@@ -109,3 +109,4 @@ DEF_PTA(KL)
DEF_PTA(WIDEKL)
DEF_PTA(AVXVNNI)
DEF_PTA(AVX512FP16)
+DEF_PTA(AMX_FP16)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 3df1f0c41..3edb7094e 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -230,7 +230,8 @@ static struct ix86_target_opts isa2_opts[] =
{ "-mkl", OPTION_MASK_ISA2_KL },
{ "-mwidekl", OPTION_MASK_ISA2_WIDEKL },
{ "-mavxvnni", OPTION_MASK_ISA2_AVXVNNI },
- { "-mavx512fp16", OPTION_MASK_ISA2_AVX512FP16 }
+ { "-mavx512fp16", OPTION_MASK_ISA2_AVX512FP16 },
+ { "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 }
};
static struct ix86_target_opts isa_opts[] =
{
@@ -1074,6 +1075,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
IX86_ATTR_ISA ("hreset", OPT_mhreset),
IX86_ATTR_ISA ("avxvnni", OPT_mavxvnni),
IX86_ATTR_ISA ("avx512fp16", OPT_mavx512fp16),
+ IX86_ATTR_ISA ("amx-fp16", OPT_mamx_fp16),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index b154110d8..52c6f02ee 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1226,3 +1226,7 @@ Enable conservative small loop unrolling.
mscatter
Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
Enable vectorization for scatter instruction.
+
+mamx-fp16
+Target Mask(ISA2_AMX_FP16) Var(ix86_isa_flags2) Save
+Support AMX-FP16 built-in functions and code generation.
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index 6afd78c2b..0447ca4b2 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -128,4 +128,6 @@
#include <keylockerintrin.h>
+#include <amxfp16intrin.h>
+
#endif /* _IMMINTRIN_H_INCLUDED */
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 33a776a79..4ba9d34cd 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -7038,6 +7038,11 @@ Enable/disable the generation of the WIDEKL instructions.
@cindex @code{target("avxvnni")} function attribute, x86
Enable/disable the generation of the AVXVNNI instructions.
+@item amx-fp16
+@itemx no-amx-fp16
+@cindex @code{target("amx-fp16")} function attribute, x86
+Enable/disable the generation of the AMX-FP16 instructions.
+
@item cld
@itemx no-cld
@cindex @code{target("cld")} function attribute, x86
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 3a48655e5..d25f13217 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1428,7 +1428,7 @@ See RS/6000 and PowerPC Options.
-mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol
-mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
-mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni@gol
--mavx512fp16 @gol
+-mavx512fp16 -mamx-fp16 @gol
-mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops @gol
-minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
-mkl -mwidekl @gol
@@ -32442,6 +32442,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
@need 200
@itemx -mwidekl
@opindex mwidekl
+@need 200
+@itemx -mamx-fp16
+@opindex mamx-fp16
These switches enable the use of instructions in the MMX, SSE,
SSE2, SSE3, SSSE3, SSE4, SSE4A, SSE4.1, SSE4.2, AVX, AVX2, AVX512F, AVX512PF,
AVX512ER, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA,
@@ -32451,8 +32454,8 @@ WBNOINVD, FMA4, PREFETCHW, RDPID, PREFETCHWT1, RDSEED, SGX, XOP, LWP,
XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16,
ENQCMD, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, SERIALIZE,
-UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16
-or CLDEMOTE extended instruction sets. Each has a corresponding
+UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16,
+AMX-FP16 or CLDEMOTE extended instruction sets. Each has a corresponding
@option{-mno-} option to disable use of these instructions.
These extensions are also available as built-in functions: see
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 71c04841d..b64b62dee 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2472,6 +2472,9 @@ Target supports the execution of @code{amx-int8} instructions.
@item amx_bf16
Target supports the execution of @code{amx-bf16} instructions.
+@item amx_fp16
+Target supports the execution of @code{amx-fp16} instructions.
+
@item cell_hw
Test system can execute AltiVec and Cell PPU instructions.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index fba3d1ac6..57a6357aa 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 5cc0fa834..1947547d6 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
index 6fff5ff46..27dd37bf9 100644
--- a/gcc/testsuite/gcc.target/i386/amx-check.h
+++ b/gcc/testsuite/gcc.target/i386/amx-check.h
@@ -213,6 +213,9 @@ main ()
#ifdef AMX_BF16
&& __builtin_cpu_supports ("amx-bf16")
#endif
+#ifdef AMX_FP16
+ && __builtin_cpu_supports ("amx-fp16")
+#endif
#ifdef __linux__
&& request_perm_xtile_data ()
#endif
diff --git a/gcc/testsuite/gcc.target/i386/amx-helper.h b/gcc/testsuite/gcc.target/i386/amx-helper.h
new file mode 100644
index 000000000..fe24d7067
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amx-helper.h
@@ -0,0 +1,61 @@
+#ifndef AMX_HELPER_H_INCLUDED
+#define AMX_HELPER_H_INCLUDED
+#if defined(AMX_FP16)
+#include <immintrin.h>
+#include <xmmintrin.h>
+#endif
+#include "amx-check.h"
+
+typedef union
+{
+ _Float16 f16;
+ uint16_t u;
+} union16f_uw;
+
+#if defined(AMX_FP16)
+/* Transformation functions between fp16/float */
+static uint16_t make_f32_fp16 (float f)
+{
+ union16f_uw tmp;
+ __m128 b = _mm_set_ss (f);
+ __m128h a;
+ tmp.f16 = _mm_cvtsh_h (_mm_cvtss_sh (a, b));
+ return tmp.u;
+}
+
+static float make_fp16_f32 (uint16_t fp)
+{
+ union16f_uw tmp;
+ tmp.u = fp;
+ __m128h b = _mm_set_sh (tmp.f16);
+ __m128 a;
+ return _mm_cvtss_f32 (_mm_cvtsh_ss (a, b));
+}
+
+/* Init tile buffer with fp16 pairs */
+void init_fp16_max_tile_buffer (uint8_t* buf)
+{
+ int i, j;
+ uint16_t* ptr = (uint16_t *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 32; j++)
+ {
+ float f = 2.5f * i + 1.25f * j;
+ ptr[i * 32 + j] = make_f32_fp16 (f);
+ }
+}
+
+/* Init tile fp16 pair buffer with zero */
+void init_fp16_max_tile_zero_buffer (uint8_t* buf)
+{
+ int i, j;
+ uint16_t* ptr = (uint16_t *) buf;
+
+ for (i = 0; i < 16; i++)
+ for (j = 0; j < 32; j++)
+ ptr[i * 32 + j] = make_f32_fp16 (0.0f);
+}
+#endif
+
+#endif
diff --git a/gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c b/gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
new file mode 100644
index 000000000..09ae6d408
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mamx-fp16" } */
+/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+#include <immintrin.h>
+
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+
+void TEST ()
+{
+ _tile_dpfp16ps (TMM1, TMM2, TMM3);
+}
diff --git a/gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c b/gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
new file mode 100644
index 000000000..a8dff945f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-O2 -mamx-fp16 -masm=intel" } */
+/* { dg-final { scan-assembler "tdpfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+#include <immintrin.h>
+
+void TEST ()
+{
+ _tile_dpfp16ps (1, 2, 3);
+}
diff --git a/gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
new file mode 100644
index 000000000..2d359a689
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c
@@ -0,0 +1,57 @@
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_tile } */
+/* { dg-require-effective-target amx_fp16 } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-tile -mamx-fp16 -mavx512fp16" } */
+#define AMX_FP16
+#define DO_TEST test_amx_fp16_dpfp16ps
+void test_amx_fp16_dpfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_dpfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *)src1->buf;
+ uint16_t *src2_buf = (uint16_t *)src2->buf;
+ float *dst_buf = (float *)dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+ int i, j, k, t;
+
+ for (i = 0; i < M; i++)
+ for (j = 0; j < N; j++)
+ for (k = 0; k < K; k++)
+ for (t = 0; t < 2; t+=2)
+ {
+ dst_buf[i * K + k] +=
+ (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t]) *
+ make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t])) +
+ (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t + 1]) *
+ make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t + 1]));
+ }
+
+}
+
+void test_amx_fp16_dpfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_dpfp16ps (&dst, &src1, &src2);
+
+ _tile_dpfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_float_tile_register (&dst_ref, &dst))
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
index f34e7a977..b00cfff03 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
+++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
@@ -80,6 +80,7 @@ extern void test_keylocker (void) __attribute__((__target__("kl")));
extern void test_widekl (void) __attribute__((__target__("widekl")));
extern void test_avxvnni (void) __attribute__((__target__("avxvnni")));
extern void test_avx512fp16 (void) __attribute__((__target__("avx512fp16")));
+extern void test_amx_fp16 (void) __attribute__((__target__("amx-fp16")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx5124fmaps(void) __attribute__((__target__("no-avx5124fmaps")));
@@ -161,6 +162,7 @@ extern void test_no_keylocker (void) __attribute__((__target__("no-kl")));
extern void test_no_widekl (void) __attribute__((__target__("no-widekl")));
extern void test_no_avxvnni (void) __attribute__((__target__("no-avxvnni")));
extern void test_no_avx512fp16 (void) __attribute__((__target__("no-avx512fp16")));
+extern void test_no_amx_fp16 (void) __attribute__((__target__("no-amx-fp16")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c
index 375d4d1b4..9ab4a7e0c 100644
--- a/gcc/testsuite/gcc.target/i386/sse-12.c
+++ b/gcc/testsuite/gcc.target/i386/sse-12.c
@@ -3,7 +3,7 @@
popcntintrin.h gfniintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mamx-fp16" } */
#include <x86intrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
index e285c307d..a1e453a98 100644
--- a/gcc/testsuite/gcc.target/i386/sse-13.c
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
index f41493b93..eaa1a8d81 100644
--- a/gcc/testsuite/gcc.target/i386/sse-14.c
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
index 31492ef36..19afe639d 100644
--- a/gcc/testsuite/gcc.target/i386/sse-22.c
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -103,7 +103,7 @@
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16")
#endif
/* Following intrinsics require immediate arguments. They
@@ -220,7 +220,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
/* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
#ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16")
#endif
#include <immintrin.h>
test_1 (_cvtss_sh, unsigned short, float, 1)
diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
index b398fd144..151201d97 100644
--- a/gcc/testsuite/gcc.target/i386/sse-23.c
+++ b/gcc/testsuite/gcc.target/i386/sse-23.c
@@ -843,6 +843,6 @@
#define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
#define __builtin_ia32_vpclmulqdq_v8di(A, B, C) __builtin_ia32_vpclmulqdq_v8di(A, B, 1)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16")
#include <x86intrin.h>
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index c858bd93b..0d83c780c 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -9972,6 +9972,17 @@ proc check_effective_target_amx_bf16 { } {
} "-mamx-bf16" ]
}
+# Return 1 if an AMX-FP16 instruction (tdpfp16ps) compiles with -mamx-fp16.
+proc check_effective_target_amx_fp16 { } {
+ return [check_no_compiler_messages amx_fp16 object {
+ void
+ foo ()
+ {
+ __asm__ volatile ("tdpfp16ps\t%%tmm1, %%tmm2, %%tmm3" ::);
+ }
+ } "-mamx-fp16" ]
+}
+
# Return 1 if vpclmulqdq instructions can be compiled.
proc check_effective_target_vpclmulqdq { } {
return [check_no_compiler_messages vpclmulqdq object {
--
2.28.0.windows.1

View File

@ -0,0 +1,902 @@
From 42a38c8abaa28f67e26b9af3f434fe0107894e7d Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Fri, 4 Nov 2022 15:01:05 +0800
Subject: [PATCH 21/32] Support Intel prefetchit0/t1
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features):
Detect PREFETCHI.
* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_PREFETCHI_SET,
OPTION_MASK_ISA2_PREFETCHI_UNSET): New.
(ix86_handle_option): Handle -mprefetchi.
* common/config/i386/i386-cpuinfo.h
(enum processor_features): Add FEATURE_PREFETCHI.
* common/config/i386/i386-isas.h: Add ISA_NAMES_TABLE_ENTRY
for prefetchi.
* config.gcc: Add prfchiintrin.h.
* config/i386/cpuid.h (bit_PREFETCHI): New.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (VOID, PCVOID, INT)
and DEF_FUNCTION_TYPE (VOID, PCVOID, INT, INT, INT).
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal):
Define __PREFETCHI__.
* config/i386/i386-expand.cc: Handle new builtins.
* config/i386/i386-isa.def (PREFETCHI):
Add DEF_PTA(PREFETCHI).
* config/i386/i386-options.cc
(ix86_valid_target_attribute_inner_p): Handle prefetchi.
* config/i386/i386.md (prefetchi): New define_insn.
* config/i386/i386.opt: Add option -mprefetchi.
* config/i386/predicates.md (local_func_symbolic_operand):
New predicate.
* config/i386/x86gprintrin.h: Include prfchiintrin.h.
* config/i386/xmmintrin.h (enum _mm_hint): New enum for
prefetchi.
(_mm_prefetch): Handle the highest bit of enum.
* doc/extend.texi: Document prefetchi.
* doc/invoke.texi: Document -mprefetchi.
* doc/sourcebuild.texi: Document target prefetchi.
* config/i386/prfchiintrin.h: New file.
gcc/testsuite/ChangeLog:
* g++.dg/other/i386-2.C: Add -mprefetchi.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/avx-1.c: Ditto.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/sse-13.c: Add -mprefetchi.
* gcc.target/i386/sse-23.c: Ditto.
* gcc.target/i386/x86gprintrin-1.c: Ditto.
* gcc.target/i386/x86gprintrin-2.c: Ditto.
* gcc.target/i386/x86gprintrin-3.c: Ditto.
* gcc.target/i386/x86gprintrin-4.c: Ditto.
* gcc.target/i386/x86gprintrin-5.c: Ditto.
* gcc.target/i386/prefetchi-1.c: New test.
* gcc.target/i386/prefetchi-2.c: Ditto.
* gcc.target/i386/prefetchi-3.c: Ditto.
* gcc.target/i386/prefetchi-4.c: Ditto.
Co-authored-by: Hongtao Liu <hongtao.liu@intel.com>
---
gcc/common/config/i386/cpuinfo.h | 2 +
gcc/common/config/i386/i386-common.cc | 15 ++++
gcc/common/config/i386/i386-cpuinfo.h | 1 +
gcc/common/config/i386/i386-isas.h | 1 +
gcc/config.gcc | 2 +-
gcc/config/i386/cpuid.h | 1 +
gcc/config/i386/i386-builtin-types.def | 4 +
gcc/config/i386/i386-builtin.def | 4 +
gcc/config/i386/i386-c.cc | 2 +
gcc/config/i386/i386-expand.cc | 77 +++++++++++++++++++
gcc/config/i386/i386-isa.def | 1 +
gcc/config/i386/i386-options.cc | 4 +-
gcc/config/i386/i386.md | 23 ++++++
gcc/config/i386/i386.opt | 4 +
gcc/config/i386/predicates.md | 15 ++++
gcc/config/i386/prfchiintrin.h | 49 ++++++++++++
gcc/config/i386/x86gprintrin.h | 2 +
gcc/config/i386/xmmintrin.h | 7 +-
gcc/doc/extend.texi | 5 ++
gcc/doc/invoke.texi | 7 +-
gcc/doc/sourcebuild.texi | 3 +
gcc/testsuite/g++.dg/other/i386-2.C | 2 +-
gcc/testsuite/g++.dg/other/i386-3.C | 2 +-
gcc/testsuite/gcc.target/i386/avx-1.c | 4 +-
gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 +
gcc/testsuite/gcc.target/i386/prefetchi-1.c | 40 ++++++++++
gcc/testsuite/gcc.target/i386/prefetchi-2.c | 26 +++++++
gcc/testsuite/gcc.target/i386/prefetchi-3.c | 20 +++++
gcc/testsuite/gcc.target/i386/prefetchi-4.c | 19 +++++
gcc/testsuite/gcc.target/i386/sse-13.c | 4 +-
gcc/testsuite/gcc.target/i386/sse-23.c | 4 +-
.../gcc.target/i386/x86gprintrin-1.c | 2 +-
.../gcc.target/i386/x86gprintrin-2.c | 2 +-
.../gcc.target/i386/x86gprintrin-3.c | 2 +-
.../gcc.target/i386/x86gprintrin-4.c | 2 +-
.../gcc.target/i386/x86gprintrin-5.c | 2 +-
36 files changed, 343 insertions(+), 19 deletions(-)
create mode 100644 gcc/config/i386/prfchiintrin.h
create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-4.c
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 5951a30aa..f17e88144 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -772,6 +772,8 @@ get_available_features (struct __processor_model *cpu_model,
__cpuid_count (7, 1, eax, ebx, ecx, edx);
if (eax & bit_HRESET)
set_feature (FEATURE_HRESET);
+ if (edx & bit_PREFETCHI)
+ set_feature (FEATURE_PREFETCHI);
if (avx_usable)
{
if (eax & bit_AVXVNNI)
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 922db33ee..c8cf532cf 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -108,6 +108,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_AMX_INT8_SET OPTION_MASK_ISA2_AMX_INT8
#define OPTION_MASK_ISA2_AMX_BF16_SET OPTION_MASK_ISA2_AMX_BF16
#define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16
+#define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
@@ -277,6 +278,7 @@ along with GCC; see the file COPYING3. If not see
(OPTION_MASK_ISA2_KL | OPTION_MASK_ISA2_WIDEKL_UNSET)
#define OPTION_MASK_ISA2_WIDEKL_UNSET OPTION_MASK_ISA2_WIDEKL
#define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16
+#define OPTION_MASK_ISA2_PREFETCHI_UNSET OPTION_MASK_ISA2_PREFETCHI
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
@@ -1140,6 +1142,19 @@ ix86_handle_option (struct gcc_options *opts,
}
return true;
+ case OPT_mprefetchi:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_PREFETCHI_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_PREFETCHI_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_PREFETCHI_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_PREFETCHI_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
index 8f22897de..95b078acf 100644
--- a/gcc/common/config/i386/i386-cpuinfo.h
+++ b/gcc/common/config/i386/i386-cpuinfo.h
@@ -241,6 +241,7 @@ enum processor_features
FEATURE_X86_64_V3,
FEATURE_X86_64_V4,
FEATURE_AMX_FP16,
+ FEATURE_PREFETCHI,
CPU_FEATURE_MAX
};
diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h
index 95bab6da2..6caf06249 100644
--- a/gcc/common/config/i386/i386-isas.h
+++ b/gcc/common/config/i386/i386-isas.h
@@ -176,4 +176,5 @@ ISA_NAMES_TABLE_START
ISA_NAMES_TABLE_ENTRY("x86-64-v3", FEATURE_X86_64_V3, P_X86_64_V3, NULL)
ISA_NAMES_TABLE_ENTRY("x86-64-v4", FEATURE_X86_64_V4, P_X86_64_V4, NULL)
ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16")
+ ISA_NAMES_TABLE_ENTRY("prefetchi", FEATURE_PREFETCHI, P_NONE, "-mprefetchi")
ISA_NAMES_TABLE_END
diff --git a/gcc/config.gcc b/gcc/config.gcc
index e2b4a23dc..81012c651 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -424,7 +424,7 @@ i[34567]86-*-* | x86_64-*-*)
amxbf16intrin.h x86gprintrin.h uintrintrin.h
hresetintrin.h keylockerintrin.h avxvnniintrin.h
mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
- amxfp16intrin.h"
+ amxfp16intrin.h prfchiintrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index d6cd8d1bf..21100149a 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -50,6 +50,7 @@
/* %edx */
#define bit_CMPXCHG8B (1 << 8)
+#define bit_PREFETCHI (1 << 14) /* Tested on %edx of CPUID leaf 7, sub-leaf 1.  */
#define bit_CMOV (1 << 15)
#define bit_MMX (1 << 23)
#define bit_FXSAVE (1 << 24)
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index e33f06ab3..ff3b0af84 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1387,3 +1387,7 @@ DEF_FUNCTION_TYPE (V32HF, V32HF)
DEF_FUNCTION_TYPE_ALIAS (V8HF_FTYPE_V8HF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V16HF_FTYPE_V16HF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V32HF_FTYPE_V32HF, ROUND)
+
+# PREFETCHI builtins
+DEF_FUNCTION_TYPE (VOID, PCVOID, INT)
+DEF_FUNCTION_TYPE (VOID, PCVOID, INT, INT, INT)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 2b1d6c733..d3ab21eea 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -469,6 +469,10 @@ BDESC (0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesdecwide2
BDESC (0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesencwide128kl_u8", IX86_BUILTIN_AESENCWIDE128KLU8, UNKNOWN, (int) UINT8_FTYPE_PV2DI_PCV2DI_PCVOID)
BDESC (0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesencwide256kl_u8", IX86_BUILTIN_AESENCWIDE256KLU8, UNKNOWN, (int) UINT8_FTYPE_PV2DI_PCV2DI_PCVOID)
+/* PREFETCHI */
+BDESC (0, 0, CODE_FOR_prefetchi, "__builtin_ia32_prefetchi", IX86_BUILTIN_PREFETCHI, UNKNOWN, (int) VOID_FTYPE_PCVOID_INT)
+BDESC (0, 0, CODE_FOR_nothing, "__builtin_ia32_prefetch", IX86_BUILTIN_PREFETCH, UNKNOWN, (int) VOID_FTYPE_PCVOID_INT_INT_INT)
+
BDESC_END (SPECIAL_ARGS, PURE_ARGS)
/* AVX */
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 4269f29e6..00880bd17 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -635,6 +635,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__AVXVNNI__");
if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP16)
def_or_undef (parse_in, "__AMX_FP16__");
+ if (isa_flag2 & OPTION_MASK_ISA2_PREFETCHI)
+ def_or_undef (parse_in, "__PREFETCHI__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 77dda5dd4..bc2e61980 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -12850,6 +12850,110 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
return target;
}
+    /* __builtin_ia32_prefetch (ADDR, RW, LOCALITY, IS_INSN).  IS_INSN == 1
+       requests an instruction prefetch, expanded like
+       __builtin_ia32_prefetchi (ADDR, LOCALITY); any other value requests
+       a data prefetch through gen_prefetch.  */
+    case IX86_BUILTIN_PREFETCH:
+      {
+        arg0 = CALL_EXPR_ARG (exp, 0); // const void *
+        arg1 = CALL_EXPR_ARG (exp, 1); // const int
+        arg2 = CALL_EXPR_ARG (exp, 2); // const int
+        arg3 = CALL_EXPR_ARG (exp, 3); // const int
+
+        op0 = expand_normal (arg0);
+        op1 = expand_normal (arg1);
+        op2 = expand_normal (arg2);
+        op3 = expand_normal (arg3);
+
+        if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
+          {
+            error ("second, third and fourth argument must be a const");
+            return const0_rtx;
+          }
+
+        if (INTVAL (op3) == 1)
+          {
+            /* The prefetchi pattern asserts that the locality is 2
+               (prefetchit1) or 3 (prefetchit0); diagnose anything else
+               here rather than reaching that assert.  */
+            if (!IN_RANGE (INTVAL (op2), 2, 3))
+              {
+                error ("invalid third argument");
+                return const0_rtx;
+              }
+
+            /* The prefetchi pattern is only enabled with -mprefetchi in
+               64-bit mode, so it must not be emitted otherwise; the hint
+               degrades to a NOP with a warning instead.  */
+            if (TARGET_64BIT && TARGET_PREFETCHI
+                && local_func_symbolic_operand (op0, GET_MODE (op0)))
+              emit_insn (gen_prefetchi (op0, op2));
+            else
+              {
+                warning (0, "instruction prefetch applies when in 64-bit mode"
+                            " with RIP-relative addressing and"
+                            " option %<-mprefetchi%>;"
+                            " they stay NOPs otherwise");
+                emit_insn (gen_nop ());
+              }
+          }
+        else
+          {
+            /* Data prefetch: load the address into a register unless it
+               is already usable as an address operand.  */
+            if (!address_operand (op0, VOIDmode))
+              {
+                op0 = convert_memory_address (Pmode, op0);
+                op0 = copy_addr_to_reg (op0);
+              }
+            emit_insn (gen_prefetch (op0, op1, op2));
+          }
+
+        return 0;
+      }
+
+    /* __builtin_ia32_prefetchi (ADDR, LOCALITY): instruction prefetch.  */
+    case IX86_BUILTIN_PREFETCHI:
+      {
+        arg0 = CALL_EXPR_ARG (exp, 0); // const void *
+        arg1 = CALL_EXPR_ARG (exp, 1); // const int
+
+        op0 = expand_normal (arg0);
+        op1 = expand_normal (arg1);
+
+        if (!CONST_INT_P (op1))
+          {
+            error ("second argument must be a const");
+            return const0_rtx;
+          }
+
+        /* Only localities 2 and 3 have a prefetchi encoding.  */
+        if (!IN_RANGE (INTVAL (op1), 2, 3))
+          {
+            error ("invalid second argument");
+            return const0_rtx;
+          }
+
+        /* GOT/PLT_PIC should not be available for instruction prefetch.
+           It must be real instruction address.  The pattern also needs
+           -mprefetchi, so check TARGET_PREFETCHI before emitting it.  */
+        if (TARGET_64BIT && TARGET_PREFETCHI
+            && local_func_symbolic_operand (op0, GET_MODE (op0)))
+          emit_insn (gen_prefetchi (op0, op1));
+        else
+          {
+            /* Ignore the hint.  */
+            warning (0, "instruction prefetch applies when in 64-bit mode"
+                        " with RIP-relative addressing and"
+                        " option %<-mprefetchi%>;"
+                        " they stay NOPs otherwise");
+            emit_insn (gen_nop ());
+          }
+
+        return 0;
+      }
+
case IX86_BUILTIN_VEC_INIT_V2SI:
case IX86_BUILTIN_VEC_INIT_V4HI:
case IX86_BUILTIN_VEC_INIT_V8QI:
diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def
index c7305c01b..744a7df85 100644
--- a/gcc/config/i386/i386-isa.def
+++ b/gcc/config/i386/i386-isa.def
@@ -110,3 +110,4 @@ DEF_PTA(WIDEKL)
DEF_PTA(AVXVNNI)
DEF_PTA(AVX512FP16)
DEF_PTA(AMX_FP16)
+DEF_PTA(PREFETCHI)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 3edb7094e..724375f02 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -231,7 +231,8 @@ static struct ix86_target_opts isa2_opts[] =
{ "-mwidekl", OPTION_MASK_ISA2_WIDEKL },
{ "-mavxvnni", OPTION_MASK_ISA2_AVXVNNI },
{ "-mavx512fp16", OPTION_MASK_ISA2_AVX512FP16 },
- { "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 }
+ { "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 },
+ { "-mprefetchi", OPTION_MASK_ISA2_PREFETCHI }
};
static struct ix86_target_opts isa_opts[] =
{
@@ -1076,6 +1077,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
IX86_ATTR_ISA ("avxvnni", OPT_mavxvnni),
IX86_ATTR_ISA ("avx512fp16", OPT_mavx512fp16),
IX86_ATTR_ISA ("amx-fp16", OPT_mamx_fp16),
+ IX86_ATTR_ISA ("prefetchi", OPT_mprefetchi),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 71691f598..f08c2cfb1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -329,6 +329,9 @@
;; For HRESET support
UNSPECV_HRESET
+
+ ;; For PREFETCHI support
+ UNSPECV_PREFETCHI
])
;; Constants to represent rounding modes in the ROUND instruction
@@ -22907,6 +22910,26 @@
(symbol_ref "memory_address_length (operands[0], false)"))
(set_attr "memory" "none")])
+;; Instruction prefetch.  Operand 1 is the locality hint: 3 emits
+;; prefetchit0, 2 emits prefetchit1; other values trip the assert.
+(define_insn "prefetchi"
+  [(unspec_volatile [(match_operand 0 "local_func_symbolic_operand" "p")
+                     (match_operand:SI 1 "const_int_operand")]
+                    UNSPECV_PREFETCHI)]
+  "TARGET_PREFETCHI && TARGET_64BIT"
+{
+  const int locality = INTVAL (operands[1]);
+
+  gcc_assert (IN_RANGE (locality, 2, 3));
+  if (locality == 3)
+    return "prefetchit0\t%0";
+  return "prefetchit1\t%0";
+}
+  [(set_attr "type" "sse")
+   (set (attr "length_address")
+        (symbol_ref "memory_address_length (operands[0], false)"))
+   (set_attr "memory" "none")])
+
(define_expand "stack_protect_set"
[(match_operand 0 "memory_operand")
(match_operand 1 "memory_operand")]
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 52c6f02ee..50cd114f6 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1230,3 +1230,7 @@ Enable vectorization for scatter instruction.
mamx-fp16
Target Mask(ISA2_AMX_FP16) Var(ix86_isa_flags2) Save
Support AMX-FP16 built-in functions and code generation.
+
+mprefetchi
+Target Mask(ISA2_PREFETCHI) Var(ix86_isa_flags2) Save
+Support PREFETCHI built-in functions and code generation.
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index ac02c61ac..774178b78 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -610,6 +610,21 @@
return false;
})
+;; Match a local symbolic operand for instruction prefetch.  Once any
+;; constant offset is stripped, a SYMBOL_REF must name a function; operands
+;; that are not SYMBOL_REFs are accepted as they are.
+(define_predicate "local_func_symbolic_operand"
+  (match_operand 0 "local_symbolic_operand")
+{
+  rtx inner = op;
+  if (GET_CODE (inner) == CONST
+      && GET_CODE (XEXP (inner, 0)) == PLUS
+      && CONST_INT_P (XEXP (XEXP (inner, 0), 1)))
+    inner = XEXP (XEXP (inner, 0), 0);
+
+  return GET_CODE (inner) != SYMBOL_REF || SYMBOL_REF_FUNCTION_P (inner);
+})
+
;; Test for a legitimate @GOTOFF operand.
;;
;; VxWorks does not impose a fixed gap between segments; the run-time
diff --git a/gcc/config/i386/prfchiintrin.h b/gcc/config/i386/prfchiintrin.h
new file mode 100644
index 000000000..06deef488
--- /dev/null
+++ b/gcc/config/i386/prfchiintrin.h
@@ -0,0 +1,49 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _X86GPRINTRIN_H_INCLUDED
+# error "Never use <prfchiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PRFCHIINTRIN_H_INCLUDED
+#define _PRFCHIINTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchit0 (void* __P)
+{
+ __builtin_ia32_prefetchi (__P, 3); /* Hint 3 selects prefetchit0.  */
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchit1 (void* __P)
+{
+ __builtin_ia32_prefetchi (__P, 2); /* Hint 2 selects prefetchit1.  */
+}
+
+#endif
+
+#endif /* _PRFCHIINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h
index e0be01d5e..0768aa0d7 100644
--- a/gcc/config/i386/x86gprintrin.h
+++ b/gcc/config/i386/x86gprintrin.h
@@ -72,6 +72,8 @@
#include <pkuintrin.h>
+#include <prfchiintrin.h>
+
#include <rdseedintrin.h>
#include <rtmintrin.h>
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index f1c704a2d..7fb179430 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -36,6 +36,8 @@
/* Constants for use with _mm_prefetch. */
enum _mm_hint
{
+ _MM_HINT_IT0 = 19, /* 0x10 | 3: bit 4 marks an instruction prefetch, hint 3.  */
+ _MM_HINT_IT1 = 18, /* 0x10 | 2: bit 4 marks an instruction prefetch, hint 2.  */
/* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */
_MM_HINT_ET0 = 7,
_MM_HINT_ET1 = 6,
@@ -51,11 +53,12 @@ enum _mm_hint
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
- __builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3);
+ __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
+ __I & 0x3, (__I & 0x10) >> 4);
}
#else
#define _mm_prefetch(P, I) \
- __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3))
+ __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
#endif
#ifndef __SSE__
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 4ba9d34cd..cb987f469 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -7043,6 +7043,11 @@ Enable/disable the generation of the AVXVNNI instructions.
@cindex @code{target("amx-fp16")} function attribute, x86
Enable/disable the generation of the AMX-FP16 instructions.
+@item prefetchi
+@itemx no-prefetchi
+@cindex @code{target("prefetchi")} function attribute, x86
+Enable/disable the generation of the PREFETCHI instructions.
+
@item cld
@itemx no-cld
@cindex @code{target("cld")} function attribute, x86
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d25f13217..211b970c0 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1428,7 +1428,7 @@ See RS/6000 and PowerPC Options.
-mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol
-mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
-mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni@gol
--mavx512fp16 -mamx-fp16 @gol
+-mavx512fp16 -mamx-fp16 -mprefetchi @gol
-mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops @gol
-minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
-mkl -mwidekl @gol
@@ -32445,6 +32445,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
@need 200
@itemx -mamx-fp16
@opindex mamx-fp16
+@need 200
+@itemx -mprefetchi
+@opindex mprefetchi
These switches enable the use of instructions in the MMX, SSE,
SSE2, SSE3, SSSE3, SSE4, SSE4A, SSE4.1, SSE4.2, AVX, AVX2, AVX512F, AVX512PF,
AVX512ER, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA,
@@ -32455,7 +32458,7 @@ XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16,
ENQCMD, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, SERIALIZE,
UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16,
-AMX-FP16 or CLDEMOTE extended instruction sets. Each has a corresponding
+AMX-FP16, PREFETCHI or CLDEMOTE extended instruction sets. Each has a corresponding
@option{-mno-} option to disable use of these instructions.
These extensions are also available as built-in functions: see
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index b64b62dee..c68e492dc 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2496,6 +2496,9 @@ Target does not require strict alignment.
@item pie_copyreloc
The x86-64 target linker supports PIE with copy reloc.
+@item prefetchi
+Target supports the execution of @code{prefetchi} instructions.
+
@item rdrand
Target supports x86 @code{rdrand} instruction.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index 57a6357aa..72ed5fed0 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 1947547d6..9dd53653f 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c
index 154e7b3b1..2b46e1b87 100644
--- a/gcc/testsuite/gcc.target/i386/avx-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -mavx2 -maes -mpclmul -mgfni -mavx512bw -mavx512fp16 -mavx512vl -mprefetchi" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
@@ -153,7 +153,7 @@
#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0)
/* xmmintrin.h */
-#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, 0, _MM_HINT_NTA)
+#define __builtin_ia32_prefetch(A, B, C, D) __builtin_ia32_prefetch(A, 0, 3, 0)
#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0)
#define __builtin_ia32_vec_set_v4hi(A, D, N) \
__builtin_ia32_vec_set_v4hi(A, D, 0)
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
index b00cfff03..9f073f78c 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
+++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
@@ -81,6 +81,7 @@ extern void test_widekl (void) __attribute__((__target__("widekl")));
extern void test_avxvnni (void) __attribute__((__target__("avxvnni")));
extern void test_avx512fp16 (void) __attribute__((__target__("avx512fp16")));
extern void test_amx_fp16 (void) __attribute__((__target__("amx-fp16")));
+extern void test_prefetchi (void) __attribute__((__target__("prefetchi")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx5124fmaps(void) __attribute__((__target__("no-avx5124fmaps")));
@@ -163,6 +164,7 @@ extern void test_no_widekl (void) __attribute__((__target__("no-widekl")));
extern void test_no_avxvnni (void) __attribute__((__target__("no-avxvnni")));
extern void test_no_avx512fp16 (void) __attribute__((__target__("no-avx512fp16")));
extern void test_no_amx_fp16 (void) __attribute__((__target__("no-amx-fp16")));
+extern void test_no_prefetchi (void) __attribute__((__target__("no-prefetchi")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1.c b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
new file mode 100644
index 000000000..80f25e70e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mprefetchi -O2" } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+" 2 } } */
+
+#include <x86intrin.h>
+
+int
+bar (int a)
+{
+ return a + 1;
+}
+
+int
+foo1 (int b)
+{
+ _mm_prefetch (bar, _MM_HINT_IT0);
+ return bar (b) + 1;
+}
+
+int
+foo2 (int b)
+{
+ _mm_prefetch (bar, _MM_HINT_IT1);
+ return bar (b) + 1;
+}
+
+int
+foo3 (int b)
+{
+ _m_prefetchit0 (bar);
+ return bar (b) + 1;
+}
+
+int
+foo4 (int b)
+{
+ _m_prefetchit1 (bar);
+ return bar (b) + 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-2.c b/gcc/testsuite/gcc.target/i386/prefetchi-2.c
new file mode 100644
index 000000000..e05ce9c73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-2.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-mprefetchi -O2" } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+prefetchit0" } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+prefetchit1" } } */
+
+#include <x86intrin.h>
+
+int
+bar (int a)
+{
+ return a + 1;
+}
+
+int
+foo1 (int b)
+{
+ __builtin_ia32_prefetch (bar, 0, 3, 1); /* { dg-warning "instruction prefetch applies when in 64-bit mode with RIP-relative addressing and option '-mprefetchi'; they stay NOPs otherwise" } */
+ return bar (b) + 1;
+}
+
+int
+foo2 (int b)
+{
+ __builtin_ia32_prefetchi (bar, 2); /* { dg-warning "instruction prefetch applies when in 64-bit mode with RIP-relative addressing and option '-mprefetchi'; they stay NOPs otherwise" } */
+ return bar (b) + 1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-3.c b/gcc/testsuite/gcc.target/i386/prefetchi-3.c
new file mode 100644
index 000000000..f0a4173d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mprefetchi -O2" } */
+/* { dg-final { scan-assembler-not "prefetchit0" } } */
+/* { dg-final { scan-assembler-not "prefetchit1" } } */
+
+#include <x86intrin.h>
+
+void* p;
+
+void extern
+prefetchi_test1 (void)
+{
+ __builtin_ia32_prefetchi (p, 2); /* { dg-warning "instruction prefetch applies when in 64-bit mode with RIP-relative addressing and option '-mprefetchi'; they stay NOPs otherwise" } */
+}
+
+void extern
+prefetchi_test2 (void)
+{
+ __builtin_ia32_prefetch (p, 0, 3, 1); /* { dg-warning "instruction prefetch applies when in 64-bit mode with RIP-relative addressing and option '-mprefetchi'; they stay NOPs otherwise" } */
+}
diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-4.c b/gcc/testsuite/gcc.target/i386/prefetchi-4.c
new file mode 100644
index 000000000..73ae596d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/prefetchi-4.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O0" } */
+
+#include <x86intrin.h>
+
+void* p;
+
+void extern
+prefetch_test (void)
+{
+ __builtin_ia32_prefetch (p, 0, 3, 0);
+ __builtin_ia32_prefetch (p, 0, 2, 0);
+ __builtin_ia32_prefetch (p, 0, 1, 0);
+ __builtin_ia32_prefetch (p, 0, 0, 0);
+ __builtin_ia32_prefetch (p, 1, 3, 0);
+ __builtin_ia32_prefetch (p, 1, 2, 0);
+ __builtin_ia32_prefetch (p, 1, 1, 0);
+ __builtin_ia32_prefetch (p, 1, 0, 0);
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
index a1e453a98..db7c0fc7a 100644
--- a/gcc/testsuite/gcc.target/i386/sse-13.c
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
@@ -125,7 +125,7 @@
#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0)
/* xmmintrin.h */
-#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, 0, _MM_HINT_NTA)
+#define __builtin_ia32_prefetch(A, B, C, D) __builtin_ia32_prefetch(A, 0, 3, 0)
#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0)
#define __builtin_ia32_vec_set_v4hi(A, D, N) \
__builtin_ia32_vec_set_v4hi(A, D, 0)
diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
index 151201d97..741694e87 100644
--- a/gcc/testsuite/gcc.target/i386/sse-23.c
+++ b/gcc/testsuite/gcc.target/i386/sse-23.c
@@ -94,7 +94,7 @@
#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0)
/* xmmintrin.h */
-#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, 0, _MM_HINT_NTA)
+#define __builtin_ia32_prefetch(A, B, C, D) __builtin_ia32_prefetch(A, 0, 3, 0)
#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0)
#define __builtin_ia32_vec_set_v4hi(A, D, N) \
__builtin_ia32_vec_set_v4hi(A, D, 0)
@@ -843,6 +843,6 @@
#define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
#define __builtin_ia32_vpclmulqdq_v8di(A, B, C) __builtin_ia32_vpclmulqdq_v8di(A, B, 1)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16,prefetchi")
#include <x86intrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/x86gprintrin-1.c b/gcc/testsuite/gcc.target/i386/x86gprintrin-1.c
index 293be094b..efe7df13b 100644
--- a/gcc/testsuite/gcc.target/i386/x86gprintrin-1.c
+++ b/gcc/testsuite/gcc.target/i386/x86gprintrin-1.c
@@ -1,7 +1,7 @@
/* Test that <x86gprintrin.h> is usable with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
/* { dg-options "-O -std=c89 -pedantic-errors -march=x86-64 -madx -mbmi -mbmi2 -mcldemote -mclflushopt -mclwb -mclzero -menqcmd -mfsgsbase -mfxsr -mhreset -mlzcnt -mlwp -mmovdiri -mmwaitx -mpconfig -mpopcnt -mpku -mptwrite -mrdpid -mrdrnd -mrdseed -mrtm -mserialize -msgx -mshstk -mtbm -mtsxldtrk -mwaitpkg -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-sse -mno-mmx" } */
-/* { dg-additional-options "-muintr" { target { ! ia32 } } } */
+/* { dg-additional-options "-muintr -mprefetchi" { target { ! ia32 } } } */
#include <x86gprintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/x86gprintrin-2.c b/gcc/testsuite/gcc.target/i386/x86gprintrin-2.c
index c63302757..5f6970df6 100644
--- a/gcc/testsuite/gcc.target/i386/x86gprintrin-2.c
+++ b/gcc/testsuite/gcc.target/i386/x86gprintrin-2.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O2 -Werror-implicit-function-declaration -march=x86-64 -madx -mbmi -mbmi2 -mcldemote -mclflushopt -mclwb -mclzero -menqcmd -mfsgsbase -mfxsr -mhreset -mlzcnt -mlwp -mmovdiri -mmwaitx -mpconfig -mpopcnt -mpku -mptwrite -mrdpid -mrdrnd -mrdseed -mrtm -mserialize -msgx -mshstk -mtbm -mtsxldtrk -mwaitpkg -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-sse -mno-mmx" } */
/* { dg-add-options bind_pic_locally } */
-/* { dg-additional-options "-muintr" { target { ! ia32 } } } */
+/* { dg-additional-options "-muintr -mprefetchi" { target { ! ia32 } } } */
/* Test that the intrinsics in <x86gprintrin.h> compile with optimization.
All of them are defined as inline functions that reference the proper
diff --git a/gcc/testsuite/gcc.target/i386/x86gprintrin-3.c b/gcc/testsuite/gcc.target/i386/x86gprintrin-3.c
index 3a7e1f4a1..5c075c375 100644
--- a/gcc/testsuite/gcc.target/i386/x86gprintrin-3.c
+++ b/gcc/testsuite/gcc.target/i386/x86gprintrin-3.c
@@ -1,7 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O0 -Werror-implicit-function-declaration -march=x86-64 -madx -mbmi -mbmi2 -mcldemote -mclflushopt -mclwb -mclzero -menqcmd -mfsgsbase -mfxsr -mhreset -mlzcnt -mlwp -mmovdiri -mmwaitx -mpconfig -mpopcnt -mpku -mptwrite -mrdpid -mrdrnd -mrdseed -mrtm -mserialize -msgx -mshstk -mtbm -mtsxldtrk -mwaitpkg -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-sse -mno-mmx" } */
/* { dg-add-options bind_pic_locally } */
-/* { dg-additional-options "-muintr" { target { ! ia32 } } } */
+/* { dg-additional-options "-muintr -mprefetchi" { target { ! ia32 } } } */
/* Test that the intrinsics in <x86gprintrin.h> compile without optimization.
All of them are defined as inline functions that reference the proper
diff --git a/gcc/testsuite/gcc.target/i386/x86gprintrin-4.c b/gcc/testsuite/gcc.target/i386/x86gprintrin-4.c
index d8a6126e5..bda4ecea3 100644
--- a/gcc/testsuite/gcc.target/i386/x86gprintrin-4.c
+++ b/gcc/testsuite/gcc.target/i386/x86gprintrin-4.c
@@ -15,7 +15,7 @@
#ifndef DIFFERENT_PRAGMAS
#ifdef __x86_64__
-#pragma GCC target ("adx,bmi,bmi2,fsgsbase,fxsr,hreset,lwp,lzcnt,popcnt,rdrnd,rdseed,tbm,rtm,serialize,tsxldtrk,uintr,xsaveopt")
+#pragma GCC target ("adx,bmi,bmi2,fsgsbase,fxsr,hreset,lwp,lzcnt,popcnt,prefetchi,rdrnd,rdseed,tbm,rtm,serialize,tsxldtrk,uintr,xsaveopt")
#else
#pragma GCC target ("adx,bmi,bmi2,fsgsbase,fxsr,hreset,lwp,lzcnt,popcnt,rdrnd,rdseed,tbm,rtm,serialize,tsxldtrk,xsaveopt")
#endif
diff --git a/gcc/testsuite/gcc.target/i386/x86gprintrin-5.c b/gcc/testsuite/gcc.target/i386/x86gprintrin-5.c
index 9ef66fdad..4aadfd0b3 100644
--- a/gcc/testsuite/gcc.target/i386/x86gprintrin-5.c
+++ b/gcc/testsuite/gcc.target/i386/x86gprintrin-5.c
@@ -28,7 +28,7 @@
#define __builtin_ia32_xabort(M) __builtin_ia32_xabort(1)
#ifdef __x86_64__
-#pragma GCC target ("adx,bmi,bmi2,clflushopt,clwb,clzero,enqcmd,fsgsbase,fxsr,hreset,lwp,lzcnt,mwaitx,pconfig,pku,popcnt,rdpid,rdrnd,rdseed,tbm,rtm,serialize,sgx,tsxldtrk,uintr,xsavec,xsaveopt,xsaves,wbnoinvd")
+#pragma GCC target ("adx,bmi,bmi2,clflushopt,clwb,clzero,enqcmd,fsgsbase,fxsr,hreset,lwp,lzcnt,mwaitx,pconfig,pku,popcnt,prefetchi,rdpid,rdrnd,rdseed,tbm,rtm,serialize,sgx,tsxldtrk,uintr,xsavec,xsaveopt,xsaves,wbnoinvd")
#else
#pragma GCC target ("adx,bmi,bmi2,clflushopt,clwb,clzero,enqcmd,fsgsbase,fxsr,hreset,lwp,lzcnt,mwaitx,pconfig,pku,popcnt,rdpid,rdrnd,rdseed,tbm,rtm,serialize,sgx,tsxldtrk,xsavec,xsaveopt,xsaves,wbnoinvd")
#endif
--
2.28.0.windows.1

View File

@ -0,0 +1,277 @@
From 7f0f8b585cf60b4c09bca42b5339995c2cc74633 Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Mon, 7 Nov 2022 11:04:57 +0800
Subject: [PATCH 22/32] Initial Granite Rapids Support
gcc/ChangeLog:
* common/config/i386/cpuinfo.h
(get_intel_cpu): Handle Granite Rapids.
* common/config/i386/i386-common.cc:
(processor_names): Add graniterapids.
(processor_alias_table): Ditto.
* common/config/i386/i386-cpuinfo.h
(enum processor_subtypes): Add INTEL_COREI7_GRANITERAPIDS.
* config.gcc: Add -march=graniterapids.
* config/i386/driver-i386.cc (host_detect_local_cpu):
Handle graniterapids.
* config/i386/i386-c.cc (ix86_target_macros_internal):
Ditto.
* config/i386/i386-options.cc (m_GRANITERAPIDS): New.
(processor_cost_table): Add graniterapids.
* config/i386/i386.h (enum processor_type):
Add PROCESSOR_GRANITERAPIDS.
(PTA_GRANITERAPIDS): Ditto.
* doc/extend.texi: Add graniterapids.
* doc/invoke.texi: Ditto.
gcc/testsuite/ChangeLog:
* g++.target/i386/mv16.C: Add graniterapids.
* gcc.target/i386/funcspec-56.inc: Handle new march.
(cherry picked from commit 339ffc5a792dd66647392a235f2f7f6344c5359e)
---
gcc/common/config/i386/cpuinfo.h | 9 +++++++++
gcc/common/config/i386/i386-common.cc | 3 +++
gcc/common/config/i386/i386-cpuinfo.h | 1 +
gcc/config.gcc | 2 +-
gcc/config/i386/driver-i386.cc | 5 ++++-
gcc/config/i386/i386-c.cc | 7 +++++++
gcc/config/i386/i386-options.cc | 4 +++-
gcc/config/i386/i386.h | 3 +++
gcc/doc/extend.texi | 3 +++
gcc/doc/invoke.texi | 11 +++++++++++
gcc/testsuite/g++.target/i386/mv16.C | 6 ++++++
gcc/testsuite/gcc.target/i386/funcspec-56.inc | 1 +
12 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index f17e88144..1f75ff1ca 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -528,6 +528,15 @@ get_intel_cpu (struct __processor_model *cpu_model,
cpu_model->__cpu_type = INTEL_COREI7;
cpu_model->__cpu_subtype = INTEL_COREI7_SAPPHIRERAPIDS;
break;
+ case 0xad:
+ case 0xae:
+ /* Granite Rapids. */
+ cpu = "graniterapids";
+ CHECK___builtin_cpu_is ("corei7");
+ CHECK___builtin_cpu_is ("graniterapids");
+ cpu_model->__cpu_type = INTEL_COREI7;
+ cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS;
+ break;
case 0x17:
case 0x1d:
/* Penryn. */
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index c8cf532cf..1aa163463 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1855,6 +1855,7 @@ const char *const processor_names[] =
"sapphirerapids",
"alderlake",
"rocketlake",
+ "graniterapids",
"intel",
"geode",
"k6",
@@ -1973,6 +1974,8 @@ const pta processor_alias_table[] =
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
{"meteorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
+ {"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS,
+ M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F},
{"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
{"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
index 95b078acf..7b2d4d242 100644
--- a/gcc/common/config/i386/i386-cpuinfo.h
+++ b/gcc/common/config/i386/i386-cpuinfo.h
@@ -92,6 +92,7 @@ enum processor_subtypes
AMDFAM19H_ZNVER3,
INTEL_COREI7_ROCKETLAKE,
AMDFAM19H_ZNVER4,
+ INTEL_COREI7_GRANITERAPIDS,
CPU_SUBTYPE_MAX
};
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 81012c651..9bad238e3 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -670,7 +670,7 @@ slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
-nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native"
+nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 graniterapids native"
# Additional x86 processors supported by --with-cpu=. Each processor
# MUST be separated by exactly one space.
diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
index 3b5161aed..ea8c3d8d1 100644
--- a/gcc/config/i386/driver-i386.cc
+++ b/gcc/config/i386/driver-i386.cc
@@ -576,8 +576,11 @@ const char *host_detect_local_cpu (int argc, const char **argv)
/* This is unknown family 0x6 CPU. */
if (has_feature (FEATURE_AVX))
{
+ /* Assume Granite Rapids. */
+ if (has_feature (FEATURE_AMX_FP16))
+ cpu = "graniterapids";
/* Assume Tiger Lake */
- if (has_feature (FEATURE_AVX512VP2INTERSECT))
+ else if (has_feature (FEATURE_AVX512VP2INTERSECT))
cpu = "tigerlake";
/* Assume Sapphire Rapids. */
else if (has_feature (FEATURE_TSXLDTRK))
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 00880bd17..04f1dd682 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -242,6 +242,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__sapphirerapids");
def_or_undef (parse_in, "__sapphirerapids__");
break;
+ case PROCESSOR_GRANITERAPIDS:
+ def_or_undef (parse_in, "__graniterapids");
+ def_or_undef (parse_in, "__graniterapids__");
+ break;
case PROCESSOR_ALDERLAKE:
def_or_undef (parse_in, "__alderlake");
def_or_undef (parse_in, "__alderlake__");
@@ -419,6 +423,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_ROCKETLAKE:
def_or_undef (parse_in, "__tune_rocketlake__");
break;
+ case PROCESSOR_GRANITERAPIDS:
+ def_or_undef (parse_in, "__tune_graniterapids__");
+ break;
case PROCESSOR_INTEL:
case PROCESSOR_GENERIC:
break;
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 724375f02..6645e3259 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -127,10 +127,11 @@ along with GCC; see the file COPYING3. If not see
#define m_SAPPHIRERAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_SAPPHIRERAPIDS)
#define m_ALDERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ALDERLAKE)
#define m_ROCKETLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ROCKETLAKE)
+#define m_GRANITERAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_GRANITERAPIDS)
#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
| m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
| m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS \
- | m_ROCKETLAKE)
+ | m_ROCKETLAKE | m_GRANITERAPIDS)
#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
#define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
@@ -761,6 +762,7 @@ static const struct processor_costs *processor_cost_table[] =
&icelake_cost,
&alderlake_cost,
&icelake_cost,
+ &icelake_cost,
&intel_cost,
&geode_cost,
&k6_cost,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index aaa136ba0..75953defc 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2250,6 +2250,7 @@ enum processor_type
PROCESSOR_SAPPHIRERAPIDS,
PROCESSOR_ALDERLAKE,
PROCESSOR_ROCKETLAKE,
+ PROCESSOR_GRANITERAPIDS,
PROCESSOR_INTEL,
PROCESSOR_GEODE,
PROCESSOR_K6,
@@ -2356,6 +2357,8 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX
| PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_LZCNT
| PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE
| PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI;
+constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16
+ | PTA_PREFETCHI;
constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
| PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index cb987f469..ba9faf4b2 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -21829,6 +21829,9 @@ Intel Core i7 Alderlake CPU.
@item rocketlake
Intel Core i7 Rocketlake CPU.
+@item graniterapids
+Intel Core i7 graniterapids CPU.
+
@item bonnell
Intel Atom Bonnell CPU.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 211b970c0..8ca831dc1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -31615,6 +31615,17 @@ CLFLUSHOPT, XSAVEC, XSAVES, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD
PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support.
+@item graniterapids
+Intel graniterapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
+SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE,
+RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, HLE, RDSEED, ADCX, PREFETCHW,
+AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ,
+AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
+VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
+MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
+SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16,
+AVX512BF16, AMX-FP16 and PREFETCHI instruction set support.
+
@item k6
AMD K6 CPU with MMX instruction set support.
diff --git a/gcc/testsuite/g++.target/i386/mv16.C b/gcc/testsuite/g++.target/i386/mv16.C
index 683928729..65cc24f32 100644
--- a/gcc/testsuite/g++.target/i386/mv16.C
+++ b/gcc/testsuite/g++.target/i386/mv16.C
@@ -92,6 +92,10 @@ int __attribute__ ((target("arch=rocketlake"))) foo () {
return 24;
}
+int __attribute__ ((target("arch=graniterapids"))) foo () {
+ return 26;
+}
+
int main ()
{
int val = foo ();
@@ -130,6 +134,8 @@ int main ()
assert (val == 23);
else if (__builtin_cpu_is ("rocketlake"))
assert (val == 24);
+ else if (__builtin_cpu_is ("graniterapids"))
+ assert (val == 26);
else
assert (val == 0);
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
index 9f073f78c..bdcfdbc88 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
+++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
@@ -188,6 +188,7 @@ extern void test_arch_cooperlake (void) __attribute__((__target__("arch=
extern void test_arch_sapphirerapids (void) __attribute__((__target__("arch=sapphirerapids")));
extern void test_arch_alderlake (void) __attribute__((__target__("arch=alderlake")));
extern void test_arch_rocketlake (void) __attribute__((__target__("arch=rocketlake")));
+extern void test_arch_graniterapids (void) __attribute__((__target__("arch=graniterapids")));
extern void test_arch_k8 (void) __attribute__((__target__("arch=k8")));
extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3")));
extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron")));
--
2.28.0.windows.1

View File

@ -0,0 +1,722 @@
From 4f1aff10d93cabe8dfbaf076b6d826a142efb6e1 Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Wed, 31 May 2023 10:45:00 +0800
Subject: [PATCH 23/32] Support Intel AMX-COMPLEX
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_available_features):
Detect AMX-COMPLEX.
* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_AMX_COMPLEX_SET,
OPTION_MASK_ISA2_AMX_COMPLEX_UNSET): New.
(ix86_handle_option): Handle -mamx-complex.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_AMX_COMPLEX.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
amx-complex.
* config.gcc: Add amxcomplexintrin.h.
* config/i386/cpuid.h (bit_AMX_COMPLEX): New.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__AMX_COMPLEX__.
* config/i386/i386-isa.def (AMX_COMPLEX): Add DEF_PTA(AMX_COMPLEX).
* config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p):
Handle amx-complex.
* config/i386/i386.opt: Add option -mamx-complex.
* config/i386/immintrin.h: Include amxcomplexintrin.h.
* doc/extend.texi: Document amx-complex.
* doc/invoke.texi: Document -mamx-complex.
* doc/sourcebuild.texi: Document target amx-complex.
* config/i386/amxcomplexintrin.h: New file.
gcc/testsuite/ChangeLog:
* g++.dg/other/i386-2.C: Add -mamx-complex.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/amx-check.h: Add cpu check for AMX-COMPLEX.
* gcc.target/i386/amx-helper.h: Add amx-complex support.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/sse-12.c: Add -mamx-complex.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add amx-complex.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp (check_effective_target_amx_complex): New.
* gcc.target/i386/amxcomplex-asmatt-1.c: New test.
* gcc.target/i386/amxcomplex-asmintel-1.c: Ditto.
* gcc.target/i386/amxcomplex-cmmimfp16ps-2.c: Ditto.
* gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c: Ditto.
---
gcc/common/config/i386/cpuinfo.h | 2 +
gcc/common/config/i386/i386-common.cc | 19 +++++-
gcc/common/config/i386/i386-cpuinfo.h | 1 +
gcc/common/config/i386/i386-isas.h | 2 +
gcc/config.gcc | 2 +-
gcc/config/i386/amxcomplexintrin.h | 59 +++++++++++++++++++
gcc/config/i386/cpuid.h | 1 +
gcc/config/i386/i386-c.cc | 2 +
gcc/config/i386/i386-isa.def | 1 +
gcc/config/i386/i386-options.cc | 4 +-
gcc/config/i386/i386.opt | 4 ++
gcc/config/i386/immintrin.h | 2 +
gcc/doc/extend.texi | 5 ++
gcc/doc/invoke.texi | 7 ++-
gcc/doc/sourcebuild.texi | 3 +
gcc/testsuite/g++.dg/other/i386-2.C | 2 +-
gcc/testsuite/g++.dg/other/i386-3.C | 2 +-
gcc/testsuite/gcc.target/i386/amx-check.h | 3 +
gcc/testsuite/gcc.target/i386/amx-helper.h | 4 +-
.../gcc.target/i386/amxcomplex-asmatt-1.c | 15 +++++
.../gcc.target/i386/amxcomplex-asmintel-1.c | 12 ++++
.../i386/amxcomplex-cmmimfp16ps-2.c | 53 +++++++++++++++++
.../i386/amxcomplex-cmmrlfp16ps-2.c | 53 +++++++++++++++++
gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 +
gcc/testsuite/gcc.target/i386/sse-12.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-13.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-14.c | 2 +-
gcc/testsuite/gcc.target/i386/sse-22.c | 4 +-
gcc/testsuite/gcc.target/i386/sse-23.c | 2 +-
gcc/testsuite/lib/target-supports.exp | 11 ++++
30 files changed, 268 insertions(+), 15 deletions(-)
create mode 100644 gcc/config/i386/amxcomplexintrin.h
create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-asmatt-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-asmintel-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-cmmimfp16ps-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 1f75ff1ca..39d3351db 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -798,6 +798,8 @@ get_available_features (struct __processor_model *cpu_model,
{
if (eax & bit_AMX_FP16)
set_feature (FEATURE_AMX_FP16);
+ if (edx & bit_AMX_COMPLEX)
+ set_feature (FEATURE_AMX_COMPLEX);
}
}
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 1aa163463..87e8afe9b 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -109,6 +109,8 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_AMX_BF16_SET OPTION_MASK_ISA2_AMX_BF16
#define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16
#define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI
+#define OPTION_MASK_ISA2_AMX_COMPLEX_SET \
+ (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX)
/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2. */
@@ -269,7 +271,8 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_SERIALIZE_UNSET OPTION_MASK_ISA2_SERIALIZE
#define OPTION_MASK_ISA2_AVX512VP2INTERSECT_UNSET OPTION_MASK_ISA2_AVX512VP2INTERSECT
#define OPTION_MASK_ISA2_TSXLDTRK_UNSET OPTION_MASK_ISA2_TSXLDTRK
-#define OPTION_MASK_ISA2_AMX_TILE_UNSET OPTION_MASK_ISA2_AMX_TILE
+#define OPTION_MASK_ISA2_AMX_TILE_UNSET \
+ (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX_UNSET)
#define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8
#define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16
#define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR
@@ -279,6 +282,7 @@ along with GCC; see the file COPYING3. If not see
#define OPTION_MASK_ISA2_WIDEKL_UNSET OPTION_MASK_ISA2_WIDEKL
#define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16
#define OPTION_MASK_ISA2_PREFETCHI_UNSET OPTION_MASK_ISA2_PREFETCHI
+#define OPTION_MASK_ISA2_AMX_COMPLEX_UNSET OPTION_MASK_ISA2_AMX_COMPLEX
/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
as -mno-sse4.1. */
@@ -1155,6 +1159,19 @@ ix86_handle_option (struct gcc_options *opts,
}
return true;
+ case OPT_mamx_complex:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_COMPLEX_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_COMPLEX_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_COMPLEX_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_COMPLEX_UNSET;
+ }
+ return true;
+
case OPT_mfma:
if (value)
{
diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
index 7b2d4d242..56020faac 100644
--- a/gcc/common/config/i386/i386-cpuinfo.h
+++ b/gcc/common/config/i386/i386-cpuinfo.h
@@ -243,6 +243,7 @@ enum processor_features
FEATURE_X86_64_V4,
FEATURE_AMX_FP16,
FEATURE_PREFETCHI,
+ FEATURE_AMX_COMPLEX,
CPU_FEATURE_MAX
};
diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h
index 6caf06249..cbef68479 100644
--- a/gcc/common/config/i386/i386-isas.h
+++ b/gcc/common/config/i386/i386-isas.h
@@ -177,4 +177,6 @@ ISA_NAMES_TABLE_START
ISA_NAMES_TABLE_ENTRY("x86-64-v4", FEATURE_X86_64_V4, P_X86_64_V4, NULL)
ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16")
ISA_NAMES_TABLE_ENTRY("prefetchi", FEATURE_PREFETCHI, P_NONE, "-mprefetchi")
+ ISA_NAMES_TABLE_ENTRY("amx-complex", FEATURE_AMX_COMPLEX,
+ P_NONE, "-mamx-complex")
ISA_NAMES_TABLE_END
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 9bad238e3..ca5c8f8a0 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -424,7 +424,7 @@ i[34567]86-*-* | x86_64-*-*)
amxbf16intrin.h x86gprintrin.h uintrintrin.h
hresetintrin.h keylockerintrin.h avxvnniintrin.h
mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
- amxfp16intrin.h prfchiintrin.h"
+ amxfp16intrin.h prfchiintrin.h amxcomplexintrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
diff --git a/gcc/config/i386/amxcomplexintrin.h b/gcc/config/i386/amxcomplexintrin.h
new file mode 100644
index 000000000..6ea1eca04
--- /dev/null
+++ b/gcc/config/i386/amxcomplexintrin.h
@@ -0,0 +1,59 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXCOMPLEXINTRIN_H_INCLUDED
+#define _AMXCOMPLEXINTRIN_H_INCLUDED
+
+#if !defined(__AMX_COMPLEX__)
+#pragma GCC push_options
+#pragma GCC target("amx-complex")
+#define __DISABLE_AMX_COMPLEX__
+#endif /* __AMX_COMPLEX__ */
+
+#if defined(__x86_64__)
+#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmimfp16ps(src1_dst,src2,src3) \
+ _tile_cmmimfp16ps_internal (src1_dst, src2, src3)
+
+#define _tile_cmmrlfp16ps(src1_dst,src2,src3) \
+ _tile_cmmrlfp16ps_internal (src1_dst, src2, src3)
+
+#endif
+
+#ifdef __DISABLE_AMX_COMPLEX__
+#undef __DISABLE_AMX_COMPLEX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_COMPLEX__ */
+
+#endif /* _AMXCOMPLEXINTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 21100149a..530a45fad 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -136,6 +136,7 @@
#define bit_AMX_BF16 (1 << 22)
#define bit_AMX_TILE (1 << 24)
#define bit_AMX_INT8 (1 << 25)
+#define bit_AMX_COMPLEX (1 << 8)
/* Extended State Enumeration Sub-leaf (%eax == 0xd, %ecx == 1) */
#define bit_XSAVEOPT (1 << 0)
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 04f1dd682..5e0ac278c 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -644,6 +644,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__AMX_FP16__");
if (isa_flag2 & OPTION_MASK_ISA2_PREFETCHI)
def_or_undef (parse_in, "__PREFETCHI__");
+ if (isa_flag2 & OPTION_MASK_ISA2_AMX_COMPLEX)
+ def_or_undef (parse_in, "__AMX_COMPLEX__");
if (TARGET_IAMCU)
{
def_or_undef (parse_in, "__iamcu");
diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def
index 744a7df85..7445b1bf7 100644
--- a/gcc/config/i386/i386-isa.def
+++ b/gcc/config/i386/i386-isa.def
@@ -111,3 +111,4 @@ DEF_PTA(AVXVNNI)
DEF_PTA(AVX512FP16)
DEF_PTA(AMX_FP16)
DEF_PTA(PREFETCHI)
+DEF_PTA(AMX_COMPLEX)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 6645e3259..7efd25084 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -233,7 +233,8 @@ static struct ix86_target_opts isa2_opts[] =
{ "-mavxvnni", OPTION_MASK_ISA2_AVXVNNI },
{ "-mavx512fp16", OPTION_MASK_ISA2_AVX512FP16 },
{ "-mamx-fp16", OPTION_MASK_ISA2_AMX_FP16 },
- { "-mprefetchi", OPTION_MASK_ISA2_PREFETCHI }
+ { "-mprefetchi", OPTION_MASK_ISA2_PREFETCHI },
+ { "-mamx-complex", OPTION_MASK_ISA2_AMX_COMPLEX }
};
static struct ix86_target_opts isa_opts[] =
{
@@ -1080,6 +1081,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
IX86_ATTR_ISA ("avx512fp16", OPT_mavx512fp16),
IX86_ATTR_ISA ("amx-fp16", OPT_mamx_fp16),
IX86_ATTR_ISA ("prefetchi", OPT_mprefetchi),
+ IX86_ATTR_ISA ("amx-complex", OPT_mamx_complex),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 50cd114f6..fba94f3f6 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1234,3 +1234,7 @@ Support AMX-FP16 built-in functions and code generation.
mprefetchi
Target Mask(ISA2_PREFETCHI) Var(ix86_isa_flags2) Save
Support PREFETCHI built-in functions and code generation.
+
+mamx-complex
+Target Mask(ISA2_AMX_COMPLEX) Var(ix86_isa_flags2) Save
+Support AMX-COMPLEX built-in functions and code generation.
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index 0447ca4b2..bd819c7f4 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -124,6 +124,8 @@
#include <amxbf16intrin.h>
+#include <amxcomplexintrin.h>
+
#include <prfchwintrin.h>
#include <keylockerintrin.h>
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index ba9faf4b2..d7b0bc802 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -7048,6 +7048,11 @@ Enable/disable the generation of the AMX-FP16 instructions.
@cindex @code{target("prefetchi")} function attribute, x86
Enable/disable the generation of the PREFETCHI instructions.
+@cindex @code{target("amx-complex")} function attribute, x86
+@item amx-complex
+@itemx no-amx-complex
+Enable/disable the generation of the AMX-COMPLEX instructions.
+
@item cld
@itemx no-cld
@cindex @code{target("cld")} function attribute, x86
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 8ca831dc1..186b33481 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1428,7 +1428,7 @@ See RS/6000 and PowerPC Options.
-mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol
-mrdseed -msgx -mavx512vp2intersect -mserialize -mtsxldtrk@gol
-mamx-tile -mamx-int8 -mamx-bf16 -muintr -mhreset -mavxvnni@gol
--mavx512fp16 -mamx-fp16 -mprefetchi @gol
+-mavx512fp16 -mamx-fp16 -mprefetchi -mamx-complex @gol
-mcldemote -mms-bitfields -mno-align-stringops -minline-all-stringops @gol
-minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
-mkl -mwidekl @gol
@@ -32459,6 +32459,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
@need 200
@itemx -mprefetchi
@opindex mprefetchi
+@need 200
+@opindex mamx-complex
+@itemx -mamx-complex
These switches enable the use of instructions in the MMX, SSE,
SSE2, SSE3, SSSE3, SSE4, SSE4A, SSE4.1, SSE4.2, AVX, AVX2, AVX512F, AVX512PF,
AVX512ER, AVX512CD, AVX512VL, AVX512BW, AVX512DQ, AVX512IFMA, AVX512VBMI, SHA,
@@ -32469,7 +32472,7 @@ XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16,
ENQCMD, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, SERIALIZE,
UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16,
-AMX-FP16, PREFETCHI or CLDEMOTE extended instruction sets. Each has a corresponding
+AMX-FP16, PREFETCHI, AMX-COMPLEX or CLDEMOTE extended instruction sets. Each has a corresponding
@option{-mno-} option to disable use of these instructions.
These extensions are also available as built-in functions: see
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index c68e492dc..454fae11a 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2472,6 +2472,9 @@ Target supports the execution of @code{amx-int8} instructions.
@item amx_bf16
Target supports the execution of @code{amx-bf16} instructions.
+@item amx_complex
+Target supports the execution of @code{amx-complex} instructions.
+
@item amx_fp16
Target supports the execution of @code{amx-fp16} instructions.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index 72ed5fed0..ae1b8f632 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi -mamx-complex" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 9dd53653f..783e35774 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi -mamx-complex" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/amx-check.h b/gcc/testsuite/gcc.target/i386/amx-check.h
index 27dd37bf9..f1a04cf1f 100644
--- a/gcc/testsuite/gcc.target/i386/amx-check.h
+++ b/gcc/testsuite/gcc.target/i386/amx-check.h
@@ -216,6 +216,9 @@ main ()
#ifdef AMX_FP16
&& __builtin_cpu_supports ("amx-fp16")
#endif
+#ifdef AMX_COMPLEX
+ && __builtin_cpu_supports ("amx-complex")
+#endif
#ifdef __linux__
&& request_perm_xtile_data ()
#endif
diff --git a/gcc/testsuite/gcc.target/i386/amx-helper.h b/gcc/testsuite/gcc.target/i386/amx-helper.h
index fe24d7067..6ed9f5eb3 100644
--- a/gcc/testsuite/gcc.target/i386/amx-helper.h
+++ b/gcc/testsuite/gcc.target/i386/amx-helper.h
@@ -1,6 +1,6 @@
#ifndef AMX_HELPER_H_INCLUDED
#define AMX_HELPER_H_INCLUDED
-#if defined(AMX_FP16)
+#if defined(AMX_FP16) || defined(AMX_COMPLEX)
#include <immintrin.h>
#include <xmmintrin.h>
#endif
@@ -12,7 +12,7 @@ typedef union
uint16_t u;
} union16f_uw;
-#if defined(AMX_FP16)
+#if defined(AMX_FP16) || defined(AMX_COMPLEX)
/* Transformation functions between fp16/float */
static uint16_t make_f32_fp16 (float f)
{
diff --git a/gcc/testsuite/gcc.target/i386/amxcomplex-asmatt-1.c b/gcc/testsuite/gcc.target/i386/amxcomplex-asmatt-1.c
new file mode 100644
index 000000000..b6745e34b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxcomplex-asmatt-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mamx-complex" } */
+/* { dg-final { scan-assembler "tcmmimfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+/* { dg-final { scan-assembler "tcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm3+\[^\n\]*%tmm2+\[^\n\]*%tmm1" } } */
+#include <immintrin.h>
+
+#define TMM1 1
+#define TMM2 2
+#define TMM3 3
+
+void TEST()
+{
+ _tile_cmmimfp16ps (TMM1, TMM2, TMM3);
+ _tile_cmmrlfp16ps (TMM1, TMM2, TMM3);
+}
diff --git a/gcc/testsuite/gcc.target/i386/amxcomplex-asmintel-1.c b/gcc/testsuite/gcc.target/i386/amxcomplex-asmintel-1.c
new file mode 100644
index 000000000..305465e88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxcomplex-asmintel-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-require-effective-target masm_intel } */
+/* { dg-options "-O2 -mamx-complex -masm=intel" } */
+/* { dg-final { scan-assembler "tcmmimfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+/* { dg-final { scan-assembler "tcmmrlfp16ps\[ \\t]+\[^\n\]*%tmm1+\[^\n\]*%tmm2+\[^\n\]*%tmm3" } } */
+#include <immintrin.h>
+
+void TEST()
+{
+ _tile_cmmimfp16ps (1, 2, 3);
+ _tile_cmmrlfp16ps (1, 2, 3);
+}
diff --git a/gcc/testsuite/gcc.target/i386/amxcomplex-cmmimfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxcomplex-cmmimfp16ps-2.c
new file mode 100644
index 000000000..6e3762c9f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxcomplex-cmmimfp16ps-2.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_complex } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-complex -mavx512fp16" } */
+#define AMX_COMPLEX
+#define DO_TEST test_amx_complex_cmmimfp16ps
+void test_amx_complex_cmmimfp16ps ();
+#include "amx-helper.h"
+
+void calc_matrix_cmmimfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+ int i, j, k, t;
+
+ for (i = 0; i < M; i++)
+ for (j = 0; j < N; j++)
+ for (k = 0; k < K; k++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[i * N + k] +=
+ (make_fp16_f32(src1_buf[i * 2 * N + 2 * j + t]) *
+ make_fp16_f32(src2_buf[j * 2 * K + 2 * k + t + 1])) +
+ (make_fp16_f32(src1_buf[i * 2 * N + 2 * j + t + 1]) *
+ make_fp16_f32(src2_buf[j * 2 * K + 2 * k + t]));
+}
+
+void test_amx_complex_cmmimfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_cmmimfp16ps (&dst, &src1, &src2);
+
+ _tile_cmmimfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_tile_register (&dst_ref, &dst))
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c b/gcc/testsuite/gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c
new file mode 100644
index 000000000..15940708a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c
@@ -0,0 +1,53 @@
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target amx_complex } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-options "-O2 -mamx-complex -mavx512fp16" } */
+#define AMX_COMPLEX
+#define DO_TEST test_amx_complex_cmmrlfp16ps
+void test_amx_complex_cmmrlfp16ps();
+#include "amx-helper.h"
+
+void calc_matrix_cmmrlfp16ps (__tile *dst, __tile *src1, __tile *src2)
+{
+ uint16_t *src1_buf = (uint16_t *) src1->buf;
+ uint16_t *src2_buf = (uint16_t *) src2->buf;
+ float *dst_buf = (float *) dst->buf;
+
+ int M = src1->rows;
+ int N = src1->colsb / 4;
+ int K = src2->colsb / 4;
+ int i, j, k, t;
+
+ for (i = 0; i < M; i++)
+ for (j = 0; j < N; j++)
+ for (k = 0; k < K; k++)
+ for (t = 0; t < 2; t+=2)
+ dst_buf[i * N + k] +=
+ (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t]) *
+ make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t])) -
+ (make_fp16_f32 (src1_buf[i * 2 * N + 2 * j + t + 1]) *
+ make_fp16_f32 (src2_buf[j * 2 * K + 2 * k + t + 1]));
+}
+
+void test_amx_complex_cmmrlfp16ps ()
+{
+ __tilecfg_u cfg;
+ __tile dst, dst_ref, src1, src2;
+ uint8_t tmp_dst_buf[1024], tmp_dst_zero_buf[1024];
+
+ init_fp16_max_tile_buffer (tmp_dst_buf);
+ init_fp16_max_tile_zero_buffer (tmp_dst_zero_buf);
+
+ init_tile_config (&cfg);
+ init_tile_reg_and_src_with_buffer (1, dst, tmp_dst_zero_buf);
+ init_tile_reg_and_src_with_buffer (2, src1, tmp_dst_buf);
+ init_tile_reg_and_src_with_buffer (3, src2, tmp_dst_buf);
+
+ calc_matrix_cmmrlfp16ps (&dst, &src1, &src2);
+
+ _tile_cmmrlfp16ps (1, 2, 3);
+ _tile_stored (1, dst_ref.buf, _STRIDE);
+
+ if (!check_tile_register (&dst_ref, &dst))
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
index bdcfdbc88..1a2f3b83d 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
+++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
@@ -82,6 +82,7 @@ extern void test_avxvnni (void) __attribute__((__target__("avxvnni")));
extern void test_avx512fp16 (void) __attribute__((__target__("avx512fp16")));
extern void test_amx_fp16 (void) __attribute__((__target__("amx-fp16")));
extern void test_prefetchi (void) __attribute__((__target__("prefetchi")));
+extern void test_amx_complex (void) __attribute__((__target__("amx-complex")));
extern void test_no_sgx (void) __attribute__((__target__("no-sgx")));
extern void test_no_avx5124fmaps(void) __attribute__((__target__("no-avx5124fmaps")));
@@ -165,6 +166,7 @@ extern void test_no_avxvnni (void) __attribute__((__target__("no-avxvnni")));
extern void test_no_avx512fp16 (void) __attribute__((__target__("no-avx512fp16")));
extern void test_no_amx_fp16 (void) __attribute__((__target__("no-amx-fp16")));
extern void test_no_prefetchi (void) __attribute__((__target__("no-prefetchi")));
+extern void test_no_amx_complex (void) __attribute__((__target__("no-amx-complex")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
extern void test_arch_core2 (void) __attribute__((__target__("arch=core2")));
diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c
index 9ab4a7e0c..d2aadd506 100644
--- a/gcc/testsuite/gcc.target/i386/sse-12.c
+++ b/gcc/testsuite/gcc.target/i386/sse-12.c
@@ -3,7 +3,7 @@
popcntintrin.h gfniintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mamx-fp16" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mamx-fp16 -mamx-complex" } */
#include <x86intrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
index db7c0fc7a..c39382836 100644
--- a/gcc/testsuite/gcc.target/i386/sse-13.c
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mavx512vp2intersect -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16 -menqcmd -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mprefetchi -mamx-complex" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
index eaa1a8d81..c34ac1aec 100644
--- a/gcc/testsuite/gcc.target/i386/sse-14.c
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16" } */
+/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16 -menqcmd -mavx512vp2intersect -mserialize -mtsxldtrk -mamx-tile -mamx-int8 -mamx-bf16 -mkl -mwidekl -mavxvnni -mavx512fp16 -mamx-fp16 -mamx-complex" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c
index 19afe639d..c3667b829 100644
--- a/gcc/testsuite/gcc.target/i386/sse-22.c
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -103,7 +103,7 @@
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16,amx-complex")
#endif
/* Following intrinsics require immediate arguments. They
@@ -220,7 +220,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
/* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */
#ifdef DIFFERENT_PRAGMAS
-#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16")
+#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx512vbmi2,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16,amx-complex")
#endif
#include <immintrin.h>
test_1 (_cvtss_sh, unsigned short, float, 1)
diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c
index 741694e87..756b6eb9c 100644
--- a/gcc/testsuite/gcc.target/i386/sse-23.c
+++ b/gcc/testsuite/gcc.target/i386/sse-23.c
@@ -843,6 +843,6 @@
#define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1)
#define __builtin_ia32_vpclmulqdq_v8di(A, B, C) __builtin_ia32_vpclmulqdq_v8di(A, B, 1)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16,prefetchi")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16,enqcmd,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,amx-fp16,prefetchi,amx-complex")
#include <x86intrin.h>
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 0d83c780c..d404058fd 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -9421,6 +9421,17 @@ proc check_effective_target_avxvnni { } {
} "-mavxvnni" ]
}
+# Return 1 if amx-complex instructions can be compiled.
+proc check_effective_target_amx_complex { } {
+ return [check_no_compiler_messages amx_complex object {
+ void
+ foo ()
+ {
+ __asm__ volatile ("tcmmimfp16ps\t%%tmm1, %%tmm2, %%tmm3" ::);
+ }
+ } "-mamx-complex" ]
+}
+
# Return 1 if sse instructions can be compiled.
proc check_effective_target_sse { } {
return [check_no_compiler_messages sse object {
--
2.28.0.windows.1

View File

@ -0,0 +1,30 @@
From 40469a6119085e4c4741bcaeb9418606d28b40c4 Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Fri, 31 Mar 2023 10:49:14 +0800
Subject: [PATCH 24/32] i386: Add AMX-COMPLEX to Granite Rapids
gcc/Changelog:
* config/i386/i386.h (PTA_GRANITERAPIDS): Add PTA_AMX_COMPLEX.
(cherry picked from commit afa87bd5f7b126e20268aa959441cde2e02bba0e)
---
gcc/config/i386/i386.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 75953defc..56d7794dc 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2358,7 +2358,7 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX
| PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE
| PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI;
constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16
- | PTA_PREFETCHI;
+ | PTA_PREFETCHI | PTA_AMX_COMPLEX;
constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
| PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
--
2.28.0.windows.1

View File

@ -0,0 +1,212 @@
From 125e5d448538f7534e0fe3df9b7947cf41605b51 Mon Sep 17 00:00:00 2001
From: "Mo, Zewei" <zewei.mo@intel.com>
Date: Mon, 3 Jul 2023 11:00:26 +0800
Subject: [PATCH 25/32] Initial Granite Rapids D Support
gcc/ChangeLog:
* common/config/i386/cpuinfo.h
(get_intel_cpu): Handle Granite Rapids D.
* common/config/i386/i386-common.cc:
(processor_alias_table): Add graniterapids-d.
* common/config/i386/i386-cpuinfo.h
(enum processor_subtypes): Add INTEL_COREI7_GRANITERAPIDS_D.
* config.gcc: Add -march=graniterapids-d.
* config/i386/driver-i386.cc (host_detect_local_cpu):
Handle graniterapids-d.
* config/i386/i386.h: (PTA_GRANITERAPIDS_D): New.
* doc/extend.texi: Add graniterapids-d.
* doc/invoke.texi: Ditto.
gcc/testsuite/ChangeLog:
* g++.target/i386/mv16.C: Add graniterapids-d.
* gcc.target/i386/funcspec-56.inc: Handle new march.
(cherry picked from commit a0cb65d34cc141571e870fb3b53b3ff47ae3338d)
---
gcc/common/config/i386/cpuinfo.h | 9 ++++++++-
gcc/common/config/i386/i386-common.cc | 2 ++
gcc/common/config/i386/i386-cpuinfo.h | 1 +
gcc/config.gcc | 3 ++-
gcc/config/i386/driver-i386.cc | 5 ++++-
gcc/config/i386/i386.h | 4 +++-
gcc/doc/extend.texi | 3 +++
gcc/doc/invoke.texi | 11 +++++++++++
gcc/testsuite/g++.target/i386/mv16.C | 6 ++++++
gcc/testsuite/gcc.target/i386/funcspec-56.inc | 1 +
10 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 39d3351db..1e53248ef 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -529,7 +529,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
cpu_model->__cpu_subtype = INTEL_COREI7_SAPPHIRERAPIDS;
break;
case 0xad:
- case 0xae:
/* Granite Rapids. */
cpu = "graniterapids";
CHECK___builtin_cpu_is ("corei7");
@@ -537,6 +536,14 @@ get_intel_cpu (struct __processor_model *cpu_model,
cpu_model->__cpu_type = INTEL_COREI7;
cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS;
break;
+ case 0xae:
+ /* Granite Rapids D. */
+ cpu = "graniterapids-d";
+ CHECK___builtin_cpu_is ("corei7");
+ CHECK___builtin_cpu_is ("graniterapids-d");
+ cpu_model->__cpu_type = INTEL_COREI7;
+ cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS_D;
+ break;
case 0x17:
case 0x1d:
/* Penryn. */
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 87e8afe9b..28f468f48 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1993,6 +1993,8 @@ const pta processor_alias_table[] =
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
{"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS,
M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F},
+ {"graniterapids-d", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS_D,
+ M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), P_PROC_AVX512F},
{"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
{"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h
index 56020faac..a32f32c97 100644
--- a/gcc/common/config/i386/i386-cpuinfo.h
+++ b/gcc/common/config/i386/i386-cpuinfo.h
@@ -93,6 +93,7 @@ enum processor_subtypes
INTEL_COREI7_ROCKETLAKE,
AMDFAM19H_ZNVER4,
INTEL_COREI7_GRANITERAPIDS,
+ INTEL_COREI7_GRANITERAPIDS_D,
CPU_SUBTYPE_MAX
};
diff --git a/gcc/config.gcc b/gcc/config.gcc
index ca5c8f8a0..3108ac4eb 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -670,7 +670,8 @@ slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \
silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \
skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \
sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \
-nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 graniterapids native"
+nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 graniterapids \
+graniterapids-d native"
# Additional x86 processors supported by --with-cpu=. Each processor
# MUST be separated by exactly one space.
diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
index ea8c3d8d1..e3bca4b49 100644
--- a/gcc/config/i386/driver-i386.cc
+++ b/gcc/config/i386/driver-i386.cc
@@ -576,8 +576,11 @@ const char *host_detect_local_cpu (int argc, const char **argv)
/* This is unknown family 0x6 CPU. */
if (has_feature (FEATURE_AVX))
{
+ /* Assume Granite Rapids D. */
+ if (has_feature (FEATURE_AMX_COMPLEX))
+ cpu = "graniterapids-d";
/* Assume Granite Rapids. */
- if (has_feature (FEATURE_AMX_FP16))
+ else if (has_feature (FEATURE_AMX_FP16))
cpu = "graniterapids";
/* Assume Tiger Lake */
else if (has_feature (FEATURE_AVX512VP2INTERSECT))
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 56d7794dc..eda3e5e5b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2358,7 +2358,9 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX
| PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE
| PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI;
constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16
- | PTA_PREFETCHI | PTA_AMX_COMPLEX;
+ | PTA_PREFETCHI;
+constexpr wide_int_bitmask PTA_GRANITERAPIDS_D = PTA_GRANITERAPIDS
+ | PTA_AMX_COMPLEX;
constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
| PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index d7b0bc802..674db2f1a 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -21837,6 +21837,9 @@ Intel Core i7 Rocketlake CPU.
@item graniterapids
Intel Core i7 graniterapids CPU.
+@item graniterapids-d
+Intel Core i7 graniterapids D CPU.
+
@item bonnell
Intel Atom Bonnell CPU.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 186b33481..a2ec060fd 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -31626,6 +31626,17 @@ MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16,
AVX512BF16, AMX-FP16 and PREFETCHI instruction set support.
+@item graniterapids-d
+Intel graniterapids D CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
+SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE,
+RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW,
+AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ,
+AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
+VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
+MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
+SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16,
+AVX512BF16, AMX-FP16, PREFETCHI and AMX-COMPLEX instruction set support.
+
@item k6
AMD K6 CPU with MMX instruction set support.
diff --git a/gcc/testsuite/g++.target/i386/mv16.C b/gcc/testsuite/g++.target/i386/mv16.C
index 65cc24f32..17b1fc722 100644
--- a/gcc/testsuite/g++.target/i386/mv16.C
+++ b/gcc/testsuite/g++.target/i386/mv16.C
@@ -96,6 +96,10 @@ int __attribute__ ((target("arch=graniterapids"))) foo () {
return 26;
}
+int __attribute__ ((target("arch=graniterapids-d"))) foo () {
+ return 28;
+}
+
int main ()
{
int val = foo ();
@@ -136,6 +140,8 @@ int main ()
assert (val == 24);
else if (__builtin_cpu_is ("graniterapids"))
assert (val == 25);
+ else if (__builtin_cpu_is ("graniterapids-d"))
+ assert (val == 26);
else
assert (val == 0);
diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
index 1a2f3b83d..f0f3397a7 100644
--- a/gcc/testsuite/gcc.target/i386/funcspec-56.inc
+++ b/gcc/testsuite/gcc.target/i386/funcspec-56.inc
@@ -191,6 +191,7 @@ extern void test_arch_sapphirerapids (void) __attribute__((__target__("arch=sapp
extern void test_arch_alderlake (void) __attribute__((__target__("arch=alderlake")));
extern void test_arch_rocketlake (void) __attribute__((__target__("arch=rocketlake")));
extern void test_arch_graniterapids (void) __attribute__((__target__("arch=graniterapids")));
+extern void test_arch_graniterapids_d (void) __attribute__((__target__("arch=graniterapids-d")));
extern void test_arch_k8 (void) __attribute__((__target__("arch=k8")));
extern void test_arch_k8_sse3 (void) __attribute__((__target__("arch=k8-sse3")));
extern void test_arch_opteron (void) __attribute__((__target__("arch=opteron")));
--
2.28.0.windows.1

View File

@ -0,0 +1,48 @@
From a809a6a416af4d08f7feeadfdd5d1f5a76a830b5 Mon Sep 17 00:00:00 2001
From: Haochen Jiang <haochen.jiang@intel.com>
Date: Thu, 20 Jul 2023 10:47:18 +0800
Subject: [PATCH 26/32] Correct Granite Rapids{, D} documentation
gcc/Changelog:
* doc/invoke.texi: Remove AVX512VP2INTERSECT in
Granite Rapids{, D} from documentation.
(cherry picked from commit 38daaaa91438d3f635a10bf5d5181c3b29f07df9)
---
gcc/doc/invoke.texi | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index a2ec060fd..4d3eccdb2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -31622,9 +31622,9 @@ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW,
AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ,
AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
-MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
-SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16,
-AVX512BF16, AMX-FP16 and PREFETCHI instruction set support.
+MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK,
+UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512-FP16, AVX512BF16, AMX-FP16
+and PREFETCHI instruction set support.
@item graniterapids-d
Intel graniterapids D CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3,
@@ -31633,9 +31633,9 @@ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW,
AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ,
AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2,
VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
-MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
-SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16,
-AVX512BF16, AMX-FP16, PREFETCHI and AMX-COMPLEX instruction set support.
+MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK,
+UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16, AVX512BF16, AMX-FP16,
+PREFETCHI and AMX-COMPLEX instruction set support.
@item k6
AMD K6 CPU with MMX instruction set support.
--
2.28.0.windows.1

View File

@ -0,0 +1,30 @@
From 62852213bc6d3e56804ca05826bb95a3a2fe4eba Mon Sep 17 00:00:00 2001
From: "Hu, Lin1" <lin1.hu@intel.com>
Date: Thu, 15 Dec 2022 15:51:18 +0800
Subject: [PATCH 27/32] i386: Remove Meteorlake's family_model
gcc/ChangeLog:
* common/config/i386/cpuinfo.h (get_intel_cpu): Remove case 0xb5
for meteorlake.
(cherry picked from commit 9e74b7ec0b218364905e3e7de5c41e8148ffc61b)
---
gcc/common/config/i386/cpuinfo.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 1e53248ef..348bc0c12 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -510,7 +510,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
/* Alder Lake. */
case 0xb7:
/* Raptor Lake. */
- case 0xb5:
case 0xaa:
case 0xac:
/* Meteor Lake. */
--
2.28.0.windows.1

View File

@ -0,0 +1,33 @@
From 73042aa18fe70aa30a9c7c760b08e642560ecccd Mon Sep 17 00:00:00 2001
From: "Cui, Lili" <lili.cui@intel.com>
Date: Thu, 29 Jun 2023 03:10:35 +0000
Subject: [PATCH 28/32] x86: Update model values for Alderlake, Rocketlake and
Raptorlake.
Update model values for Alderlake, Rocketlake and Raptorlake according to SDM.
gcc/ChangeLog
* common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8
from Rocketlake, move model value 0xbf from Alderlake to Raptorlake.
(cherry picked from commit e510c3be13a8ccdf1fc1b27c2501c126d493f335)
---
gcc/common/config/i386/cpuinfo.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 348bc0c12..f9bcb6fad 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -509,6 +509,7 @@ get_intel_cpu (struct __processor_model *cpu_model,
case 0x9a:
/* Alder Lake. */
case 0xb7:
+ case 0xbf:
/* Raptor Lake. */
case 0xaa:
case 0xac:
--
2.28.0.windows.1

View File

@ -0,0 +1,32 @@
From 3dbe28984e0f9c24d6670cfba42983bc32c08b0a Mon Sep 17 00:00:00 2001
From: "Cui, Lili" <lili.cui@intel.com>
Date: Mon, 14 Aug 2023 02:06:00 +0000
Subject: [PATCH 29/32] x86: Update model values for Raptorlake.
Update model values for Raptorlake according to SDM.
gcc/ChangeLog
* common/config/i386/cpuinfo.h (get_intel_cpu): Add model value 0xba
to Raptorlake.
(cherry picked from commit 614052dd4ea083e086712809c754ffebd9361316)
---
gcc/common/config/i386/cpuinfo.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index f9bcb6fad..da1568fd1 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -509,6 +509,7 @@ get_intel_cpu (struct __processor_model *cpu_model,
case 0x9a:
/* Alder Lake. */
case 0xb7:
+ case 0xba:
case 0xbf:
/* Raptor Lake. */
case 0xaa:
--
2.28.0.windows.1

View File

@ -0,0 +1,159 @@
From 8db0f3cd29bd7f937ffa01dd1100360fbbf5b6f4 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 22 Aug 2023 18:18:31 +0800
Subject: [PATCH 30/32] Fix target_clone ("arch=graniterapids-d")
Both "graniterapids-d" and "graniterapids" are attached with
PROCESSOR_GRANITERAPIDS in processor_alias_table but mapped to
different __cpu_subtype in get_intel_cpu.
And get_builtin_code_for_version will try to match the first
PROCESSOR_GRANITERAPIDS in processor_alias_table which maps to
"graniterapids" here.
1861 else if (new_target->arch_specified && new_target->arch > 0)
1862 for (i = 0; i < pta_size; i++)
1863 if (processor_alias_table[i].processor == new_target->arch)
1864 {
1865 const pta *arch_info = &processor_alias_table[i];
1866 switch (arch_info->priority)
1867 {
1868 default:
1869 arg_str = arch_info->name;
This mismatch makes dispatch_function_versions check the predicate
of __builtin_cpu_is ("graniterapids") for "graniterapids-d" and causes
the issue.
The patch explicitly adds PROCESSOR_GRANITERAPIDS_D to make a distinction.
For "alderlake","raptorlake", "meteorlake" they share same isa, cost,
tuning, and mapped to the same __cpu_type/__cpu_subtype in
get_intel_cpu, so no need to add PROCESSOR_RAPTORLAKE and others.
gcc/ChangeLog:
* common/config/i386/i386-common.cc (processor_names): Add new
member graniterapids-d.
(processor_alias_table): Update table with
PROCESSOR_GRANITERAPIDS_D.
* config/i386/i386-options.cc (m_GRANITERAPIDS_D): New macro.
(m_CORE_AVX512): Add m_GRANITERAPIDS_D.
(processor_cost_table): Add icelake_cost for
PROCESSOR_GRANITERAPIDS_D.
* config/i386/i386.h (enum processor_type): Add new member
PROCESSOR_GRANITERAPIDS_D.
* config/i386/i386-c.cc (ix86_target_macros_internal): Handle
PROCESSOR_GRANITERAPIDS_D
---
gcc/common/config/i386/i386-common.cc | 6 ++++--
gcc/config/i386/i386-c.cc | 8 ++++++++
gcc/config/i386/i386-options.cc | 4 +++-
gcc/config/i386/i386.h | 3 ++-
4 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 28f468f48..bec6801ce 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1873,6 +1873,7 @@ const char *const processor_names[] =
"alderlake",
"rocketlake",
"graniterapids",
+ "graniterapids-d",
"intel",
"geode",
"k6",
@@ -1993,8 +1994,9 @@ const pta processor_alias_table[] =
M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
{"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS,
M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F},
- {"graniterapids-d", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS_D,
- M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), P_PROC_AVX512F},
+ {"graniterapids-d", PROCESSOR_GRANITERAPIDS_D, CPU_HASWELL,
+ PTA_GRANITERAPIDS_D, M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D),
+ P_PROC_AVX512F},
{"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
{"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 5e0ac278c..49f0db2b8 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -246,6 +246,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__graniterapids");
def_or_undef (parse_in, "__graniterapids__");
break;
+ case PROCESSOR_GRANITERAPIDS_D:
+ def_or_undef (parse_in, "__graniterapids_d");
+ def_or_undef (parse_in, "__graniterapids_d__");
+ break;
case PROCESSOR_ALDERLAKE:
def_or_undef (parse_in, "__alderlake");
def_or_undef (parse_in, "__alderlake__");
@@ -254,6 +258,7 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__rocketlake");
def_or_undef (parse_in, "__rocketlake__");
break;
+
/* use PROCESSOR_max to not set/unset the arch macro. */
case PROCESSOR_max:
break;
@@ -426,6 +431,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_GRANITERAPIDS:
def_or_undef (parse_in, "__tune_graniterapids__");
break;
+ case PROCESSOR_GRANITERAPIDS_D:
+ def_or_undef (parse_in, "__tune_graniterapids_d__");
+ break;
case PROCESSOR_INTEL:
case PROCESSOR_GENERIC:
break;
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 7efd25084..86932d719 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -128,10 +128,11 @@ along with GCC; see the file COPYING3. If not see
#define m_ALDERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ALDERLAKE)
#define m_ROCKETLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ROCKETLAKE)
#define m_GRANITERAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_GRANITERAPIDS)
+#define m_GRANITERAPIDS_D (HOST_WIDE_INT_1U<<PROCESSOR_GRANITERAPIDS_D)
#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
| m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
| m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS \
- | m_ROCKETLAKE | m_GRANITERAPIDS)
+ | m_ROCKETLAKE | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
#define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
@@ -764,6 +765,7 @@ static const struct processor_costs *processor_cost_table[] =
&alderlake_cost,
&icelake_cost,
&icelake_cost,
+ &icelake_cost,
&intel_cost,
&geode_cost,
&k6_cost,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index eda3e5e5b..5052f878d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2216,7 +2216,7 @@ extern int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER];
#define DEFAULT_LARGE_SECTION_THRESHOLD 65536
/* Which processor to tune code generation for. These must be in sync
- with processor_target_table in i386.cc. */
+ with processor_cost_table in i386-options.cc. */
enum processor_type
{
@@ -2251,6 +2251,7 @@ enum processor_type
PROCESSOR_ALDERLAKE,
PROCESSOR_ROCKETLAKE,
PROCESSOR_GRANITERAPIDS,
+ PROCESSOR_GRANITERAPIDS_D,
PROCESSOR_INTEL,
PROCESSOR_GEODE,
PROCESSOR_K6,
--
2.28.0.windows.1

View File

@ -0,0 +1,321 @@
From c546aad5d38165e2962456525a0f6a427e03583b Mon Sep 17 00:00:00 2001
From: "Vladimir N. Makarov" <vmakarov@redhat.com>
Date: Thu, 26 Oct 2023 09:50:40 -0400
Subject: [PATCH 31/32] Modify cost calculation for dealing with equivalences
RISCV target developers reported that pseudos with equivalence used in
a loop can be spilled. Simple changes of heuristics of cost
calculation of pseudos with equivalence or even ignoring equivalences
resulted in numerous testsuite failures on different targets or worse
spec2017 performance. This patch implements more sophisticated cost
calculations of pseudos with equivalences. The patch does not change
RA behaviour for targets still using the old reload pass instead of
LRA. The patch solves the reported problem and improves x86-64
specint2017 a bit (specfp2017 performance stays the same). The patch
takes into account how the equivalence will be used: will it be
integrated into the user insns or require an input reload insn. It
requires additional pass over insns. To compensate RA slow down, the
patch removes a pass over insns in the reload pass used by IRA before.
This also decouples IRA from reload more and will help to remove the
reload pass in the future if it ever happens.
gcc/ChangeLog:
* dwarf2out.cc (reg_loc_descriptor): Use lra_eliminate_regs when
LRA is used.
* ira-costs.cc: Include regset.h.
(equiv_can_be_consumed_p, get_equiv_regno, calculate_equiv_gains):
New functions.
(find_costs_and_classes): Call calculate_equiv_gains and redefine
mem_cost of pseudos with equivs when LRA is used.
* var-tracking.cc: Include ira.h and lra.h.
(vt_initialize): Use lra_eliminate_regs when LRA is used.
---
gcc/dwarf2out.cc | 4 +-
gcc/ira-costs.cc | 169 ++++++++++++++++++++++++++++++++++++++++++--
gcc/var-tracking.cc | 14 +++-
3 files changed, 179 insertions(+), 8 deletions(-)
diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc
index 0a5c081d8..f0f6f4fd4 100644
--- a/gcc/dwarf2out.cc
+++ b/gcc/dwarf2out.cc
@@ -14263,7 +14263,9 @@ reg_loc_descriptor (rtx rtl, enum var_init_status initialized)
argument pointer and soft frame pointer rtx's.
Use DW_OP_fbreg offset DW_OP_stack_value in this case. */
if ((rtl == arg_pointer_rtx || rtl == frame_pointer_rtx)
- && eliminate_regs (rtl, VOIDmode, NULL_RTX) != rtl)
+ && (ira_use_lra_p
+ ? lra_eliminate_regs (rtl, VOIDmode, NULL_RTX)
+ : eliminate_regs (rtl, VOIDmode, NULL_RTX)) != rtl)
{
dw_loc_descr_ref result = NULL;
diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 642fda529..c79311783 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see
#include "tm_p.h"
#include "insn-config.h"
#include "regs.h"
+#include "regset.h"
#include "ira.h"
#include "ira-int.h"
#include "addresses.h"
@@ -1750,6 +1751,145 @@ process_bb_node_for_costs (ira_loop_tree_node_t loop_tree_node)
process_bb_for_costs (bb);
}
+/* Check that reg REGNO can be changed by TO in INSN. Return true in case the
+ result insn would be valid one. */
+static bool
+equiv_can_be_consumed_p (int regno, rtx to, rtx_insn *insn)
+{
+ validate_replace_src_group (regno_reg_rtx[regno], to, insn);
+ bool res = verify_changes (0);
+ cancel_changes (0);
+ return res;
+}
+
+/* Return true if X contains a pseudo with equivalence. In this case also
+ return its register number through parameter REGNO. If the pseudo is a part of subreg,
+ return the subreg through parameter SUBREG. */
+
+static bool
+get_equiv_regno (rtx x, int &regno, rtx &subreg)
+{
+ subreg = NULL_RTX;
+ if (GET_CODE (x) == SUBREG)
+ {
+ subreg = x;
+ x = SUBREG_REG (x);
+ }
+ if (REG_P (x)
+ && (ira_reg_equiv[REGNO (x)].memory != NULL
+ || ira_reg_equiv[REGNO (x)].constant != NULL))
+ {
+ regno = REGNO (x);
+ return true;
+ }
+ RTX_CODE code = GET_CODE (x);
+ const char *fmt = GET_RTX_FORMAT (code);
+
+ for (int i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+ if (fmt[i] == 'e')
+ {
+ if (get_equiv_regno (XEXP (x, i), regno, subreg))
+ return true;
+ }
+ else if (fmt[i] == 'E')
+ {
+ for (int j = 0; j < XVECLEN (x, i); j++)
+ if (get_equiv_regno (XVECEXP (x, i, j), regno, subreg))
+ return true;
+ }
+ return false;
+}
+
+/* A pass through the current function insns. Calculate costs of using
+ equivalences for pseudos and store them in regno_equiv_gains. */
+
+static void
+calculate_equiv_gains (void)
+{
+ basic_block bb;
+ int regno, freq, cost;
+ rtx subreg;
+ rtx_insn *insn;
+ machine_mode mode;
+ enum reg_class rclass;
+ bitmap_head equiv_pseudos;
+
+ ira_assert (allocno_p);
+ bitmap_initialize (&equiv_pseudos, &reg_obstack);
+ for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--)
+ if (ira_reg_equiv[regno].init_insns != NULL
+ && (ira_reg_equiv[regno].memory != NULL
+ || (ira_reg_equiv[regno].constant != NULL
+ /* Ignore complicated constants which probably will be placed
+ in memory: */
+ && GET_CODE (ira_reg_equiv[regno].constant) != CONST_DOUBLE
+ && GET_CODE (ira_reg_equiv[regno].constant) != CONST_VECTOR
+ && GET_CODE (ira_reg_equiv[regno].constant) != LABEL_REF)))
+ {
+ rtx_insn_list *x;
+ for (x = ira_reg_equiv[regno].init_insns; x != NULL; x = x->next ())
+ {
+ insn = x->insn ();
+ rtx set = single_set (insn);
+
+ if (set == NULL_RTX || SET_DEST (set) != regno_reg_rtx[regno])
+ break;
+ bb = BLOCK_FOR_INSN (insn);
+ ira_curr_regno_allocno_map
+ = ira_bb_nodes[bb->index].parent->regno_allocno_map;
+ mode = PSEUDO_REGNO_MODE (regno);
+ rclass = pref[COST_INDEX (regno)];
+ ira_init_register_move_cost_if_necessary (mode);
+ if (ira_reg_equiv[regno].memory != NULL)
+ cost = ira_memory_move_cost[mode][rclass][1];
+ else
+ cost = ira_register_move_cost[mode][rclass][rclass];
+ freq = REG_FREQ_FROM_BB (bb);
+ regno_equiv_gains[regno] += cost * freq;
+ }
+ if (x != NULL)
+ /* We found complicated equiv or reverse equiv mem=reg. Ignore
+ them. */
+ regno_equiv_gains[regno] = 0;
+ else
+ bitmap_set_bit (&equiv_pseudos, regno);
+ }
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ freq = REG_FREQ_FROM_BB (bb);
+ ira_curr_regno_allocno_map
+ = ira_bb_nodes[bb->index].parent->regno_allocno_map;
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!INSN_P (insn) || !get_equiv_regno (PATTERN (insn), regno, subreg)
+ || !bitmap_bit_p (&equiv_pseudos, regno))
+ continue;
+ rtx subst = ira_reg_equiv[regno].memory;
+
+ if (subst == NULL)
+ subst = ira_reg_equiv[regno].constant;
+ ira_assert (subst != NULL);
+ mode = PSEUDO_REGNO_MODE (regno);
+ ira_init_register_move_cost_if_necessary (mode);
+ bool consumed_p = equiv_can_be_consumed_p (regno, subst, insn);
+
+ rclass = pref[COST_INDEX (regno)];
+ if (MEM_P (subst)
+ /* If it is a change of constant into double for example, the
+ result constant probably will be placed in memory. */
+ || (subreg != NULL_RTX && !INTEGRAL_MODE_P (GET_MODE (subreg))))
+ cost = ira_memory_move_cost[mode][rclass][1] + (consumed_p ? 0 : 1);
+ else if (consumed_p)
+ continue;
+ else
+ cost = ira_register_move_cost[mode][rclass][rclass];
+ regno_equiv_gains[regno] -= cost * freq;
+ }
+ }
+ bitmap_clear (&equiv_pseudos);
+}
+
/* Find costs of register classes and memory for allocnos or pseudos
and their best costs. Set up preferred, alternative and allocno
classes for pseudos. */
@@ -1848,6 +1988,12 @@ find_costs_and_classes (FILE *dump_file)
if (pass == 0)
pref = pref_buffer;
+ if (ira_use_lra_p && allocno_p && pass == 1)
+ /* It is a pass through all insns. So do it once and only for RA (not
+ for insn scheduler) when we already found preferable pseudo register
+ classes on the previous pass. */
+ calculate_equiv_gains ();
+
/* Now for each allocno look at how desirable each class is and
find which class is preferred. */
for (i = max_reg_num () - 1; i >= FIRST_PSEUDO_REGISTER; i--)
@@ -1940,6 +2086,17 @@ find_costs_and_classes (FILE *dump_file)
}
if (i >= first_moveable_pseudo && i < last_moveable_pseudo)
i_mem_cost = 0;
+ else if (ira_use_lra_p)
+ {
+ if (equiv_savings > 0)
+ {
+ i_mem_cost = 0;
+ if (ira_dump_file != NULL && internal_flag_ira_verbose > 5)
+ fprintf (ira_dump_file,
+ " Use MEM for r%d as the equiv savings is %d\n",
+ i, equiv_savings);
+ }
+ }
else if (equiv_savings < 0)
i_mem_cost = -equiv_savings;
else if (equiv_savings > 0)
@@ -2378,7 +2535,10 @@ ira_costs (void)
total_allocno_costs = (struct costs *) ira_allocate (max_struct_costs_size
* ira_allocnos_num);
initiate_regno_cost_classes ();
- calculate_elim_costs_all_insns ();
+ if (!ira_use_lra_p)
+ /* Process equivs in reload to update costs through hook
+ ira_adjust_equiv_reg_cost. */
+ calculate_elim_costs_all_insns ();
find_costs_and_classes (ira_dump_file);
setup_allocno_class_and_costs ();
finish_regno_cost_classes ();
@@ -2503,13 +2663,14 @@ ira_tune_allocno_costs (void)
}
}
-/* Add COST to the estimated gain for eliminating REGNO with its
- equivalence. If COST is zero, record that no such elimination is
- possible. */
+/* A hook from the reload pass. Add COST to the estimated gain for eliminating
+ REGNO with its equivalence. If COST is zero, record that no such
+ elimination is possible. */
void
ira_adjust_equiv_reg_cost (unsigned regno, int cost)
{
+ ira_assert (!ira_use_lra_p);
if (cost == 0)
regno_equiv_gains[regno] = 0;
else
diff --git a/gcc/var-tracking.cc b/gcc/var-tracking.cc
index 7c3ad0a55..b10c8c1eb 100644
--- a/gcc/var-tracking.cc
+++ b/gcc/var-tracking.cc
@@ -107,6 +107,8 @@
#include "cfgrtl.h"
#include "cfganal.h"
#include "reload.h"
+#include "ira.h"
+#include "lra.h"
#include "calls.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
@@ -10133,7 +10135,9 @@ vt_initialize (void)
#else
reg = arg_pointer_rtx;
#endif
- elim = eliminate_regs (reg, VOIDmode, NULL_RTX);
+ elim = (ira_use_lra_p
+ ? lra_eliminate_regs (reg, VOIDmode, NULL_RTX)
+ : eliminate_regs (reg, VOIDmode, NULL_RTX));
if (elim != reg)
{
if (GET_CODE (elim) == PLUS)
@@ -10153,7 +10157,9 @@ vt_initialize (void)
reg = arg_pointer_rtx;
fp_cfa_offset = ARG_POINTER_CFA_OFFSET (current_function_decl);
#endif
- elim = eliminate_regs (reg, VOIDmode, NULL_RTX);
+ elim = (ira_use_lra_p
+ ? lra_eliminate_regs (reg, VOIDmode, NULL_RTX)
+ : eliminate_regs (reg, VOIDmode, NULL_RTX));
if (elim != reg)
{
if (GET_CODE (elim) == PLUS)
@@ -10185,7 +10191,9 @@ vt_initialize (void)
#else
reg = arg_pointer_rtx;
#endif
- elim = eliminate_regs (reg, VOIDmode, NULL_RTX);
+ elim = (ira_use_lra_p
+ ? lra_eliminate_regs (reg, VOIDmode, NULL_RTX)
+ : eliminate_regs (reg, VOIDmode, NULL_RTX));
if (elim != reg)
{
if (GET_CODE (elim) == PLUS)
--
2.28.0.windows.1

View File

@ -0,0 +1,49 @@
From 4965473a4211a9feb46a0d168180ab450cb18bcc Mon Sep 17 00:00:00 2001
From: "Vladimir N. Makarov" <vmakarov@redhat.com>
Date: Fri, 27 Oct 2023 08:28:24 -0400
Subject: [PATCH 32/32] Add cost calculation for reg equivalence invariants
My recent patch improving cost calculation for pseudos with equivalence
resulted in failure of gcc.target/arm/eliminate.c on aarch64. This patch
fixes this failure.
gcc/ChangeLog:
* ira-costs.cc (get_equiv_regno, calculate_equiv_gains):
Process reg equivalence invariants.
---
gcc/ira-costs.cc | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index c79311783..d33104a30 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1777,6 +1777,7 @@ get_equiv_regno (rtx x, int &regno, rtx &subreg)
}
if (REG_P (x)
&& (ira_reg_equiv[REGNO (x)].memory != NULL
+ || ira_reg_equiv[REGNO (x)].invariant != NULL
|| ira_reg_equiv[REGNO (x)].constant != NULL))
{
regno = REGNO (x);
@@ -1819,6 +1820,7 @@ calculate_equiv_gains (void)
for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--)
if (ira_reg_equiv[regno].init_insns != NULL
&& (ira_reg_equiv[regno].memory != NULL
+ || ira_reg_equiv[regno].invariant != NULL
|| (ira_reg_equiv[regno].constant != NULL
/* Ignore complicated constants which probably will be placed
in memory: */
@@ -1869,6 +1871,8 @@ calculate_equiv_gains (void)
if (subst == NULL)
subst = ira_reg_equiv[regno].constant;
+ if (subst == NULL)
+ subst = ira_reg_equiv[regno].invariant;
ira_assert (subst != NULL);
mode = PSEUDO_REGNO_MODE (regno);
ira_init_register_move_cost_if_necessary (mode);
--
2.28.0.windows.1

View File

@ -2,7 +2,7 @@
%global gcc_major 12
# Note, gcc_release must be integer, if you want to add suffixes to
# %%{release}, append them after %%{gcc_release} on Release: line.
%global gcc_release 24
%global gcc_release 25
%global _unpackaged_files_terminate_build 0
%global _performance_build 1
@ -193,6 +193,38 @@ Patch52: 0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
Patch53: 0053-struct-reorg-Add-Semi-Relayout.patch
Patch54: 0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch
Patch55: 0055-Struct-Reorg-Port-bugfixes-to-GCC-12.3.1.patch
Patch56: 0056-Fix-bug-that-verifying-gimple-failed-when-reorg-leve.patch
Patch57: 0057-AutoFdo-Fix-memory-leaks-in-autofdo.patch
Patch58: 0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch
Patch59: 0059-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
Patch60: 0060-Make-option-mvzeroupper-independent-of-optimization-.patch
Patch61: 0061-i386-Sync-tune_string-with-arch_string-for-target-at.patch
Patch62: 0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
Patch63: 0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
Patch64: 0064-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
Patch65: 0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
Patch66: 0066-Software-mitigation-Disable-gather-generation-in-vec.patch
Patch67: 0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
Patch68: 0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
Patch69: 0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
Patch70: 0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
Patch71: 0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
Patch72: 0072-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
Patch73: 0073-Initial-Raptorlake-Support.patch
Patch74: 0074-Initial-Meteorlake-Support.patch
Patch75: 0075-Support-Intel-AMX-FP16-ISA.patch
Patch76: 0076-Support-Intel-prefetchit0-t1.patch
Patch77: 0077-Initial-Granite-Rapids-Support.patch
Patch78: 0078-Support-Intel-AMX-COMPLEX.patch
Patch79: 0079-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch
Patch80: 0080-Initial-Granite-Rapids-D-Support.patch
Patch81: 0081-Correct-Granite-Rapids-D-documentation.patch
Patch82: 0082-i386-Remove-Meteorlake-s-family_model.patch
Patch83: 0083-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch
Patch84: 0084-x86-Update-model-values-for-Raptorlake.patch
Patch85: 0085-Fix-target_clone-arch-graniterapids-d.patch
Patch86: 0086-Modfify-cost-calculation-for-dealing-with-equivalenc.patch
Patch87: 0087-Add-cost-calculation-for-reg-equivalence-invariants.patch
# Part 3000 ~ 4999
%ifarch loongarch64
@ -847,6 +879,38 @@ not stable, so plugins must be rebuilt any time GCC is updated.
%patch53 -p1
%patch54 -p1
%patch55 -p1
%patch56 -p1
%patch57 -p1
%patch58 -p1
%patch59 -p1
%patch60 -p1
%patch61 -p1
%patch62 -p1
%patch63 -p1
%patch64 -p1
%patch65 -p1
%patch66 -p1
%patch67 -p1
%patch68 -p1
%patch69 -p1
%patch70 -p1
%patch71 -p1
%patch72 -p1
%patch73 -p1
%patch74 -p1
%patch75 -p1
%patch76 -p1
%patch77 -p1
%patch78 -p1
%patch79 -p1
%patch80 -p1
%patch81 -p1
%patch82 -p1
%patch83 -p1
%patch84 -p1
%patch85 -p1
%patch86 -p1
%patch87 -p1
%ifarch loongarch64
%patch3001 -p1
@ -3238,6 +3302,10 @@ end
%doc rpm.doc/changelogs/libcc1/ChangeLog*
%changelog
* Wed Apr 24 2024 Wang Ding <wangding16@huawei.com> - 12.3.1-25
- Type: Sync
- DESC: Sync patch from openeuler/gcc
* Tue Apr 23 2024 laokz <zhangkai@iscas.ac.cn> - 12.3.1-24
- Type: SPEC
- DESC: riscv64 enable libasan, libubsan package