From 2707dd64747053a6d946aa268f02f0e9a274877f Mon Sep 17 00:00:00 2001
From: wangding16 <wangding16@huawei.com>
Date: Wed, 24 Apr 2024 12:44:41 +0800
Subject: [PATCH] [Sync] Sync patch from openeuler/gcc

---
 ...ion-Disable-gather-generation-in-vec.patch | 220 ++++++++++++++++++
 ...her-m-no-scatter-to-enable-disable-v.patch | 187 +++++++++++++++
 ...t-modifier-for-fcmaddcph-fmaddcph-fc.patch | 129 ++++++++++
 ...ly-for-the-alternative-which-move-DF.patch | 106 +++++++++
 ...ue-to-vec_merge-pcmp-to-blendvb-spli.patch | 163 +++++++++++++
 ...s-AVX_U128_CLEAN-after-call_insn-who.patch | 151 ++++++++++++
 6 files changed, 956 insertions(+)
 create mode 100644 0066-Software-mitigation-Disable-gather-generation-in-vec.patch
 create mode 100644 0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
 create mode 100644 0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
 create mode 100644 0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
 create mode 100644 0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
 create mode 100644 0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch

diff --git a/0066-Software-mitigation-Disable-gather-generation-in-vec.patch b/0066-Software-mitigation-Disable-gather-generation-in-vec.patch
new file mode 100644
index 0000000..32ed4e0
--- /dev/null
+++ b/0066-Software-mitigation-Disable-gather-generation-in-vec.patch
@@ -0,0 +1,220 @@
+From cfffbec938afdc45c31db5ec282ce21ad1ba2dc7 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 10 Aug 2023 11:41:39 +0800
+Subject: [PATCH 11/32] Software mitigation: Disable gather generation in
+ vectorization for GDS affected Intel Processors.
+
+For more details of GDS (Gather Data Sampling), refer to
+https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html
+
+After microcode update, there's performance regression. To avoid that,
+the patch disables gather generation in autovectorization but uses
+gather scalar emulation instead.
+
+gcc/ChangeLog:
+
+	* config/i386/i386-options.cc (m_GDS): New macro.
+	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
+	enable for m_GDS.
+	(X86_TUNE_USE_GATHER_4PARTS): Ditto.
+	(X86_TUNE_USE_GATHER): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
+	gather vectorization.
+	* gcc.target/i386/avx2-gather-6.c: Ditto.
+	* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
+	* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
+	* gcc.target/i386/pr88531-1b.c: Ditto.
+	* gcc.target/i386/pr88531-1c.c: Ditto.
+
+(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d)
+---
+ gcc/config/i386/i386-options.cc                     | 5 +++++
+ gcc/config/i386/x86-tune.def                        | 9 ++++++---
+ gcc/testsuite/gcc.target/i386/avx2-gather-2.c       | 2 +-
+ gcc/testsuite/gcc.target/i386/avx2-gather-6.c       | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c   | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c   | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c  | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c  | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c  | 2 +-
+ gcc/testsuite/gcc.target/i386/pr88531-1b.c          | 2 +-
+ gcc/testsuite/gcc.target/i386/pr88531-1c.c          | 2 +-
+ 12 files changed, 21 insertions(+), 13 deletions(-)
+
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index fb2ed942f..9617fc162 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3.  If not see
+ #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
+ #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
+ #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
++/* Gather Data Sampling / CVE-2022-40982 / INTEL-SA-00828.
++   Software mitigation.  */
++#define m_GDS (m_SKYLAKE | m_SKYLAKE_AVX512 | m_CANNONLAKE \
++	       | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
++	       | m_TIGERLAKE | m_COOPERLAKE | m_ROCKETLAKE)
+ 
+ #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
+ #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index e6b9e2125..4392709fc 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -467,7 +467,8 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
+ /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
+    elements.  */
+ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
+-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
++	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
++	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
+    elements.  */
+@@ -477,7 +478,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
+ /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
+    elements.  */
+ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
+-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 |  m_ALDERLAKE | m_GENERIC))
++	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
++	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
+    elements.  */
+@@ -487,7 +489,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
+ /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
+    elements.  */
+ DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
+-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
++	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
++	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
+    elements.  */
+diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
+index ad5ef7310..978924b0f 100644
+--- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
++++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */
++/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake -mtune=haswell" } */
+ 
+ #include "avx2-gather-1.c"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
+index b9119581a..067b251e3 100644
+--- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
++++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=skylake" } */
++/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details  -mtune=haswell" } */
+ 
+ #include "avx2-gather-5.c"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
+index 06d21bb01..d1a229861 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
++++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
+index 462e951fd..d7b0b2b28 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
++++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
+index 55a28dddb..07439185e 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
+index 969600885..3a9810827 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
+index 6b0c8a859..ac669e048 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
+index 3af568ab3..14a1083b6 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1b.c b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
+index 812c8a10f..e6df789de 100644
+--- a/gcc/testsuite/gcc.target/i386/pr88531-1b.c
++++ b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -march=skylake -mfpmath=sse" } */
++/* { dg-options "-O3 -march=skylake -mfpmath=sse -mtune=haswell" } */
+ 
+ #include "pr88531-1a.c"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1c.c b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
+index 43fc5913e..a093c87c0 100644
+--- a/gcc/testsuite/gcc.target/i386/pr88531-1c.c
++++ b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse" } */
++/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse -mtune=haswell" } */
+ 
+ #include "pr88531-1a.c"
+ 
+-- 
+2.28.0.windows.1
+
diff --git a/0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch b/0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
new file mode 100644
index 0000000..2eda756
--- /dev/null
+++ b/0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
@@ -0,0 +1,187 @@
+From c269629130cb23252da2db026ce9ed13f57f69f4 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 10 Aug 2023 16:26:13 +0800
+Subject: [PATCH 12/32] Support -m[no-]gather -m[no-]scatter to enable/disable
+ vectorization for all gather/scatter instructions
+
+Rename original use_gather to use_gather_8parts, Support
+-mtune-ctrl={,^}use_gather to set/clear tune features
+use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather
+as alias of -mtune-ctrl=, use_gather, ^use_gather.
+
+Similar for use_scatter.
+
+gcc/ChangeLog:
+
+	* config/i386/i386-builtins.cc
+	(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
+	* config/i386/i386-options.cc (parse_mtune_ctrl_str):
+	Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
+	8parts} for -mtune-crtl={,^}{use_gather,use_scatter}.
+	* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
+	for use_scatter_8parts
+	* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
+	(TARGET_USE_GATHER_8PARTS): .. this.
+	(TARGET_USE_SCATTER): Rename to ..
+	(TARGET_USE_SCATTER_8PARTS): .. this.
+	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
+	(X86_TUNE_USE_GATHER_8PARTS): .. this.
+	(X86_TUNE_USE_SCATTER): Rename to
+	(X86_TUNE_USE_SCATTER_8PARTS): .. this.
+	* config/i386/i386.opt: Add new options mgather, mscatter.
+
+(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737)
+---
+ gcc/config/i386/i386-builtins.cc |  2 +-
+ gcc/config/i386/i386-options.cc  | 54 +++++++++++++++++++++++---------
+ gcc/config/i386/i386.cc          |  2 +-
+ gcc/config/i386/i386.h           |  8 ++---
+ gcc/config/i386/i386.opt         |  4 +++
+ gcc/config/i386/x86-tune.def     |  4 +--
+ 6 files changed, 52 insertions(+), 22 deletions(-)
+
+diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
+index 050c6228a..8ed32e14f 100644
+--- a/gcc/config/i386/i386-builtins.cc
++++ b/gcc/config/i386/i386-builtins.cc
+@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
+ 	  ? !TARGET_USE_GATHER_2PARTS
+ 	  : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
+ 	     ? !TARGET_USE_GATHER_4PARTS
+-	     : !TARGET_USE_GATHER)))
++	     : !TARGET_USE_GATHER_8PARTS)))
+     return NULL_TREE;
+ 
+   if ((TREE_CODE (index_type) != INTEGER_TYPE
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index 9617fc162..3df1f0c41 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump)
+           curr_feature_string++;
+           clear = true;
+         }
+-      for (i = 0; i < X86_TUNE_LAST; i++)
+-        {
+-          if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+-            {
+-              ix86_tune_features[i] = !clear;
+-              if (dump)
+-                fprintf (stderr, "Explicitly %s feature %s\n",
+-                         clear ? "clear" : "set", ix86_tune_feature_names[i]);
+-              break;
+-            }
+-        }
+-      if (i == X86_TUNE_LAST)
+-	error ("unknown parameter to option %<-mtune-ctrl%>: %s",
+-	       clear ? curr_feature_string - 1 : curr_feature_string);
++
++      if (!strcmp (curr_feature_string, "use_gather"))
++	{
++	  ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
++	  if (dump)
++	    fprintf (stderr, "Explicitly %s features use_gather_2parts,"
++		     " use_gather_4parts, use_gather_8parts\n",
++		     clear ? "clear" : "set");
++
++	}
++      else if (!strcmp (curr_feature_string, "use_scatter"))
++	{
++	  ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
++	  if (dump)
++	    fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
++		     " use_scatter_4parts, use_scatter_8parts\n",
++		     clear ? "clear" : "set");
++	}
++      else
++	{
++	  for (i = 0; i < X86_TUNE_LAST; i++)
++	    {
++	      if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
++		{
++		  ix86_tune_features[i] = !clear;
++		  if (dump)
++		    fprintf (stderr, "Explicitly %s feature %s\n",
++			     clear ? "clear" : "set", ix86_tune_feature_names[i]);
++		  break;
++		}
++	    }
++
++	  if (i == X86_TUNE_LAST)
++	    error ("unknown parameter to option %<-mtune-ctrl%>: %s",
++		   clear ? curr_feature_string - 1 : curr_feature_string);
++	}
+       curr_feature_string = next_feature_string;
+     }
+   while (curr_feature_string);
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 479fc6010..e75d37023 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
+       ? !TARGET_USE_SCATTER_2PARTS
+       : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
+ 	 ? !TARGET_USE_SCATTER_4PARTS
+-	 : !TARGET_USE_SCATTER))
++	 : !TARGET_USE_SCATTER_8PARTS))
+     return NULL_TREE;
+ 
+   if ((TREE_CODE (index_type) != INTEGER_TYPE
+diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
+index 688aaabd3..aaa136ba0 100644
+--- a/gcc/config/i386/i386.h
++++ b/gcc/config/i386/i386.h
+@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
+ 	ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
+ #define TARGET_USE_SCATTER_4PARTS \
+ 	ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
+-#define TARGET_USE_GATHER \
+-	ix86_tune_features[X86_TUNE_USE_GATHER]
+-#define TARGET_USE_SCATTER \
+-	ix86_tune_features[X86_TUNE_USE_SCATTER]
++#define TARGET_USE_GATHER_8PARTS \
++	ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS]
++#define TARGET_USE_SCATTER_8PARTS \
++	ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS]
+ #define TARGET_FUSE_CMP_AND_BRANCH_32 \
+ 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
+ #define TARGET_FUSE_CMP_AND_BRANCH_64 \
+diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
+index 498fb454d..b154110d8 100644
+--- a/gcc/config/i386/i386.opt
++++ b/gcc/config/i386/i386.opt
+@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated.
+ munroll-only-small-loops
+ Target Var(ix86_unroll_only_small_loops) Init(0) Save
+ Enable conservative small loop unrolling.
++
++mscatter
++Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
++Enable vectorization for scatter instruction.
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index 4392709fc..bdb455d20 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
+ 
+ /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
+    elements.  */
+-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
++DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
+ 	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
+ 	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
+    elements.  */
+-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
++DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
+ 	  ~(m_ZNVER4))
+ 
+ /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
+-- 
+2.28.0.windows.1
+
diff --git a/0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch b/0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
new file mode 100644
index 0000000..42a8790
--- /dev/null
+++ b/0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
@@ -0,0 +1,129 @@
+From 764518a35e90a3e13c469275da9c3c7002fe1982 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Fri, 8 Sep 2023 09:22:43 +0800
+Subject: [PATCH 13/32] Remove constraint modifier % for
+ fcmaddcph/fmaddcph/fcmulcph since there're not commutative.
+
+gcc/ChangeLog:
+
+	PR target/111306
+	PR target/111335
+	* config/i386/sse.md (int_comm): New int_attr.
+	(fma_<complexopname>_<mode><sdc_maskz_name><round_name>):
+	Remove % for Complex conjugate operations since they're not
+	commutative.
+	(fma_<complexpairopname>_<mode>_pair): Ditto.
+	(<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto.
+	(cmul<conj_op><mode>3): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr111306.c: New test.
+
+(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb)
+---
+ gcc/config/i386/sse.md                   | 16 ++++++++---
+ gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++
+ 2 files changed, 48 insertions(+), 4 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c
+
+diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
+index 3af159896..f25dd5f2b 100644
+--- a/gcc/config/i386/sse.md
++++ b/gcc/config/i386/sse.md
+@@ -6318,6 +6318,14 @@
+ 	[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
+ 	 (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+ 
++(define_int_attr int_comm
++	[(UNSPEC_COMPLEX_FMA "")
++	 (UNSPEC_COMPLEX_FMA_PAIR "")
++	 (UNSPEC_COMPLEX_FCMA "")
++	 (UNSPEC_COMPLEX_FCMA_PAIR "")
++	 (UNSPEC_COMPLEX_FMUL "%")
++	 (UNSPEC_COMPLEX_FCMUL "")])
++
+ (define_int_attr conj_op
+ 	[(UNSPEC_COMPLEX_FMA "")
+ 	 (UNSPEC_COMPLEX_FCMA "_conj")
+@@ -6431,7 +6439,7 @@
+ (define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>"
+   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ 	(unspec:VF_AVX512FP16VL
+-	  [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v")
++	  [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "<int_comm>v")
+ 	   (match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>")
+ 	   (match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")]
+ 	   UNSPEC_COMPLEX_F_C_MA))]
+@@ -6495,7 +6503,7 @@
+ (define_insn "fma_<complexpairopname>_<mode>_pair"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
+        (unspec:VF1_AVX512VL
+-	 [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
++	 [(match_operand:VF1_AVX512VL 1 "vector_operand" "<int_comm>v")
+ 	  (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
+ 	  (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
+ 	  UNSPEC_COMPLEX_F_C_MA_PAIR))]
+@@ -6562,7 +6570,7 @@
+   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ 	(vec_merge:VF_AVX512FP16VL
+ 	  (unspec:VF_AVX512FP16VL
+-	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
++	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
+ 	     (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")
+ 	     (match_operand:VF_AVX512FP16VL 3 "register_operand" "0")]
+ 	     UNSPEC_COMPLEX_F_C_MA)
+@@ -6586,7 +6594,7 @@
+ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
+   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ 	  (unspec:VF_AVX512FP16VL
+-	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
++	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
+ 	     (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
+ 	     UNSPEC_COMPLEX_F_C_MUL))]
+   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c
+new file mode 100644
+index 000000000..541725ebd
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr111306.c
+@@ -0,0 +1,36 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
++/* { dg-require-effective-target avx512fp16 } */
++
++#define AVX512FP16
++#include "avx512f-helper.h"
++
++__attribute__((optimize("O2"),noipa))
++void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
++  __m512h rA = _mm512_loadu_ph(a);
++  for (int i = 0; i < n; i += 32) {
++    __m512h rB = _mm512_loadu_ph(b + i);
++    _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
++  }
++}
++
++void
++test_512 (void)
++{
++  int n = 32;
++  _Float16 a[n], b[n], c[n];
++  _Float16 exp[n];
++  for (int i = 1; i <= n; i++) {
++    a[i - 1] = i & 1 ? -i : i;
++    b[i - 1] = i;
++  }
++
++  func1(a, b, n, c);
++  for (int i = 0; i < n / 32; i += 2) {
++    if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
++	|| c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
++      __builtin_abort ();
++    }
++}
++
++
+-- 
+2.28.0.windows.1
+
diff --git a/0069-Disparage-slightly-for-the-alternative-which-move-DF.patch b/0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
new file mode 100644
index 0000000..7943ef1
--- /dev/null
+++ b/0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
@@ -0,0 +1,106 @@
+From afd539adfe762adb57863299a11987b7e20e7987 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Wed, 5 Jul 2023 13:45:11 +0800
+Subject: [PATCH 14/32] Disparage slightly for the alternative which move
+ DFmode between SSE_REGS and GENERAL_REGS.
+
+For testcase
+
+void __cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+GCC-14 with -O2 and -march=x86-64 options generates the following code:
+
+__cond_swap(double*, double*):
+        movsd   xmm1, QWORD PTR [rdi]
+        movsd   xmm0, QWORD PTR [rsi]
+        comisd  xmm0, xmm1
+        jbe     .L2
+        movq    rax, xmm1
+        movapd  xmm1, xmm0
+        movq    xmm0, rax
+.L2:
+        movsd   QWORD PTR [rsi], xmm1
+        movsd   QWORD PTR [rdi], xmm0
+        ret
+
+rax is used to save and restore DFmode value. In RA both GENERAL_REGS
+and SSE_REGS cost zero since we didn't disparage the
+alternative in movdf_internal pattern, according to register
+allocation order, GENERAL_REGS is allocated. The patch add ? for
+alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal
+pattern, after that we get optimal RA.
+
+__cond_swap:
+.LFB0:
+	.cfi_startproc
+	movsd	(%rdi), %xmm1
+	movsd	(%rsi), %xmm0
+	comisd	%xmm1, %xmm0
+	jbe	.L2
+	movapd	%xmm1, %xmm2
+	movapd	%xmm0, %xmm1
+	movapd	%xmm2, %xmm0
+.L2:
+	movsd	%xmm1, (%rsi)
+	movsd	%xmm0, (%rdi)
+	ret
+
+gcc/ChangeLog:
+
+	PR target/110170
+	* config/i386/i386.md (movdf_internal): Disparage slightly for
+	2 alternatives (r,v) and (v,r) by adding constraint modifier
+	'?'.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr110170-3.c: New test.
+
+(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e)
+---
+ gcc/config/i386/i386.md                    |  4 ++--
+ gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++
+ 2 files changed, 13 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c
+
+diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
+index be07be10d..71691f598 100644
+--- a/gcc/config/i386/i386.md
++++ b/gcc/config/i386/i386.md
+@@ -3582,9 +3582,9 @@
+ ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
+ (define_insn "*movdf_internal"
+   [(set (match_operand:DF 0 "nonimmediate_operand"
+-    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  ,o ,r  ,m")
++    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r  ,o ,r  ,m")
+ 	(match_operand:DF 1 "general_operand"
+-    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
++    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))]
+   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+    && (lra_in_progress || reload_completed
+        || !CONST_DOUBLE_P (operands[1])
+diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c
+new file mode 100644
+index 000000000..70daa89e9
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile { target { ! ia32 } } } */
++/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
++/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
++
++void __cond_swap(double* __x, double* __y) {
++  _Bool __r = (*__x < *__y);
++  double __tmp = __r ? *__x : *__y;
++  *__y = __r ? *__y : *__x;
++  *__x = __tmp;
++}
++
+-- 
+2.28.0.windows.1
+
diff --git a/0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch b/0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
new file mode 100644
index 0000000..7f0939c
--- /dev/null
+++ b/0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
@@ -0,0 +1,163 @@
+From 88516507757932c1e67ce99d240596935971d2d0 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 9 Nov 2023 13:20:05 +0800
+Subject: [PATCH 15/32] Fix wrong code due to vec_merge + pcmp to blendvb
+ splitter.
+
+gcc/ChangeLog:
+
+	PR target/112443
+	* config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition
+	from LT to GT since there's not in the pattern.
+	(*avx2_pcmp<mode>3_5): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* g++.target/i386/pr112443.C: New test.
+
+(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd)
+---
+ gcc/config/i386/sse.md                   |   4 +-
+ gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++
+ 2 files changed, 110 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C
+
+diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
+index f25dd5f2b..23b858ab2 100644
+--- a/gcc/config/i386/sse.md
++++ b/gcc/config/i386/sse.md
+@@ -16358,7 +16358,7 @@
+ 	     (match_dup 4))]
+ 	     UNSPEC_BLENDV))]
+ {
+-  if (INTVAL (operands[5]) == 1)
++  if (INTVAL (operands[5]) == 5)
+     std::swap (operands[1], operands[2]);
+   operands[3] = gen_lowpart (<MODE>mode, operands[3]);
+ })
+@@ -16388,7 +16388,7 @@
+ 	     (match_dup 4))]
+ 	     UNSPEC_BLENDV))]
+ {
+-  if (INTVAL (operands[5]) == 1)
++  if (INTVAL (operands[5]) == 5)
+     std::swap (operands[1], operands[2]);
+ })
+ 
+diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C
+new file mode 100644
+index 000000000..ebfa9b4a7
+--- /dev/null
++++ b/gcc/testsuite/g++.target/i386/pr112443.C
+@@ -0,0 +1,108 @@
++/* { dg-do run } */
++/* { dg-require-effective-target avx512bw } */
++/* { dg-require-effective-target avx512vl } */
++/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */
++
++#include <cstdint>
++#include <x86intrin.h>
++#include <functional>
++#include <ostream>
++
++#define AVX512BW
++#define AVX512VL
++
++#include "avx512f-helper.h"
++
++struct TensorIteratorBase{
++  char* in;
++  char* out;
++
++  void for_each(std::function<void(char*, char*, int64_t size)> loop){
++    loop(out, in, 32);
++  }    
++};
++
++class Vectorized {
++protected:
++  __m256i values;
++
++  static inline __m256i invert(const __m256i& v) {
++    const auto ones = _mm256_set1_epi64x(-1);
++    return _mm256_xor_si256(ones, v);
++  }
++public:
++  operator __m256i() const {
++    return values;
++  }
++
++  static constexpr int size() {
++    return 32;
++  }
++
++  Vectorized() {}
++  Vectorized(__m256i v) : values(v) {}
++  Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
++  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
++			   const Vectorized& mask) {
++    return _mm256_blendv_epi8(a, b, mask);
++  }
++  static Vectorized loadu(const void* ptr) {
++    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
++  }
++  void store(void* ptr) const {
++    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
++  }
++
++  Vectorized operator<(const Vectorized& other) const {
++    __m256i max = _mm256_max_epu8(values, other);
++    return invert(_mm256_cmpeq_epi8(max, values));
++  }
++  Vectorized operator-(const Vectorized& b) {
++    return _mm256_sub_epi8(values, b);
++  }
++};
++
++std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) {
++  uint8_t buf[Vectorized::size()];
++  vec.store(buf);
++  stream << "vec[";
++  for (int i = 0; i != Vectorized::size(); i++) {
++    if (i != 0)
++      stream << ", ";
++    stream << buf[i]*1;
++  }
++  stream << "]";
++  return stream;
++}
++
++void run(TensorIteratorBase iter){
++  Vectorized zero_vec(0);
++  Vectorized one_vec(1);
++
++  iter.for_each([=](char* out, char* in, int64_t size) {
++    for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) {
++      auto self_vec = Vectorized::loadu(in + i);
++      auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec);
++      auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec);
++      auto outv = left - right;
++      outv.store(out + i);
++    }
++  });
++}
++
++void
++test_256 (){
++  char in[32];
++  char out[32];
++  for(auto& x: in) x = 1;
++  run(TensorIteratorBase{in, out});
++  Vectorized::loadu (out);
++  for (int i = 0; i != 32; i++)
++    if (out[i] != 1)
++      __builtin_abort ();
++}
++
++void
++test_128 ()
++{
++}
+-- 
+2.28.0.windows.1
+
diff --git a/0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch b/0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
new file mode 100644
index 0000000..e0e2f1f
--- /dev/null
+++ b/0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
@@ -0,0 +1,151 @@
+From 204ffa7f503411ccac0161c951726274648b6374 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 7 Dec 2023 09:17:27 +0800
+Subject: [PATCH 16/32] Don't assume it's AVX_U128_CLEAN after call_insn whose
+ abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.
+
+If the function desn't clobber any sse registers or only clobber
+128-bit part, then vzeroupper isn't issued before the function exit.
+the status not CLEAN but ANY after the function.
+
+Also for sibling_call, it's safe to issue an vzeroupper. Also there
+could be missing vzeroupper since there's no mode_exit for
+sibling_call_p.
+
+gcc/ChangeLog:
+
+	PR target/112891
+	* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
+	AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
+	align with ix86_avx_u128_mode_needed.
+	(ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
+	sibling_call.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr112891.c: New test.
+	* gcc.target/i386/pr112891-2.c: New test.
+
+(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642)
+---
+ gcc/config/i386/i386.cc                    | 22 +++++++++++++---
+ gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
+ gcc/testsuite/gcc.target/i386/pr112891.c   | 29 +++++++++++++++++++++
+ 3 files changed, 78 insertions(+), 3 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
+
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index e75d37023..60f3296b0 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
+ 	 modes wider than 256 bits.  It's only safe to issue a
+ 	 vzeroupper if all SSE registers are clobbered.  */
+       const function_abi &abi = insn_callee_abi (insn);
+-      if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+-				  abi.mode_clobbers (V4DImode)))
++      /* Should be safe to issue an vzeroupper before sibling_call_p.
++	 Also there not mode_exit for sibling_call, so there could be
++	 missing vzeroupper for that.  */
++      if (!(SIBLING_CALL_P (insn)
++	    || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
++				      abi.mode_clobbers (V4DImode))))
+ 	return AVX_U128_ANY;
+ 
+       return AVX_U128_CLEAN;
+@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
+       bool avx_upper_reg_found = false;
+       note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
+ 
+-      return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
++      if (avx_upper_reg_found)
++	return AVX_U128_DIRTY;
++
++      /* If the function desn't clobber any sse registers or only clobber
++	 128-bit part, Then vzeroupper isn't issued before the function exit.
++	 the status not CLEAN but ANY after the function.  */
++      const function_abi &abi = insn_callee_abi (insn);
++      if (!(SIBLING_CALL_P (insn)
++	    || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
++				      abi.mode_clobbers (V4DImode))))
++	return AVX_U128_ANY;
++
++      return  AVX_U128_CLEAN;
+     }
+ 
+   /* Otherwise, return current mode.  Remember that if insn
+diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
+new file mode 100644
+index 000000000..164c3985d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
+@@ -0,0 +1,30 @@
++/* { dg-do compile } */
++/* { dg-options "-mavx2 -O3" } */
++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
++
++void
++__attribute__((noinline))
++bar (double* a)
++{
++  a[0] = 1.0;
++  a[1] = 2.0;
++}
++
++double
++__attribute__((noinline))
++foo (double* __restrict a, double* b)
++{
++  a[0] += b[0];
++  a[1] += b[1];
++  a[2] += b[2];
++  a[3] += b[3];
++  bar (b);
++  return a[5] + b[5];
++}
++
++double
++foo1 (double* __restrict a, double* b)
++{
++  double c = foo (a, b);
++  return __builtin_exp (c);
++}
+diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
+new file mode 100644
+index 000000000..dbf6c6794
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr112891.c
+@@ -0,0 +1,29 @@
++/* { dg-do compile } */
++/* { dg-options "-mavx2 -O3" } */
++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
++
++void
++__attribute__((noinline))
++bar (double* a)
++{
++  a[0] = 1.0;
++  a[1] = 2.0;
++}
++
++void
++__attribute__((noinline))
++foo (double* __restrict a, double* b)
++{
++  a[0] += b[0];
++  a[1] += b[1];
++  a[2] += b[2];
++  a[3] += b[3];
++  bar (b);
++}
++
++double
++foo1 (double* __restrict a, double* b)
++{
++  foo (a, b);
++  return __builtin_exp (b[1]);
++}
+-- 
+2.28.0.windows.1
+