!320 Backport some patches on EPOL gcc-12 to GCC-12 for openEuler.

From: @github-27907959 Reviewed-by: @huang-xiaoquan Signed-off-by: @huang-xiaoquan
2023-08-17 07:59:35 +00:00 · 2023-08-17 07:59:35 +00:00 · a8340ebb3e
commit a8340ebb3e
parent 110ceef4aa ad4ee2e6e9
4 changed files with 863 additions and 1 deletions
--- a/0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
+++ b/0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
@ -0,0 +1,124 @@
+From 355eb8e20327242442d139fb052d3a3befde3dd7 Mon Sep 17 00:00:00 2001
+From: "Cui,Lili" <lili.cui@intel.com>
+Date: Tue, 1 Nov 2022 09:16:49 +0800
+Subject: [PATCH] Add attribute hot judgement for INLINE_HINT_known_hot
+ hint.
+
+We set up the INLINE_HINT_known_hot hint only when we have profile feedback.
+Now add a function attribute check for it: when both caller and callee
+have __attribute__((hot)), we will also set up the INLINE_HINT_known_hot
+hint for that edge.
+
+With this patch applied,
+ADL Multi-copy:    538.imagic_r  16.7%
+ICX Multi-copy:    538.imagic_r  15.2%
+CLX Multi-copy:    538.imagic_r  12.7%
+Znver3 Multi-copy: 538.imagic_r  10.6%
+Arm Multi-copy:    538.imagic_r  13.4%
+
+gcc/ChangeLog
+
+	* ipa-inline-analysis.cc (do_estimate_edge_time): Add function attribute
+	judgement for INLINE_HINT_known_hot hint.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.dg/ipa/inlinehint-6.c: New test.
+---
+ gcc/ipa-inline-analysis.cc              | 13 ++++---
+ gcc/testsuite/gcc.dg/ipa/inlinehint-6.c | 47 +++++++++++++++++++++++++
+ 2 files changed, 56 insertions(+), 4 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+
+diff --git a/gcc/ipa-inline-analysis.cc b/gcc/ipa-inline-analysis.cc
+index 11d8d09ee..16ac24cfc 100644
+--- a/gcc/ipa-inline-analysis.cc
+++ b/gcc/ipa-inline-analysis.cc
+@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "ipa-utils.h"
+ #include "cfgexpand.h"
+ #include "gimplify.h"
+#include "attribs.h"
+ 
+ /* Cached node/edge growths.  */
+ fast_call_summary<edge_growth_cache_entry *, va_heap> *edge_growth_cache = NULL;
+@@ -249,15 +250,19 @@ do_estimate_edge_time (struct cgraph_edge *edge, sreal *ret_nonspec_time)
+       hints = estimates.hints;
+     }
+ 
+-  /* When we have profile feedback, we can quite safely identify hot
+-     edges and for those we disable size limits.  Don't do that when
+-     probability that caller will call the callee is low however, since it
+  /* When we have profile feedback or function attribute, we can quite safely
+     identify hot edges and for those we disable size limits.  Don't do that
+     when probability that caller will call the callee is low however, since it
+      may hurt optimization of the caller's hot path.  */
+-  if (edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
+  if ((edge->count.ipa ().initialized_p () && edge->maybe_hot_p ()
+       && (edge->count.ipa ().apply_scale (2, 1)
+ 	  > (edge->caller->inlined_to
+ 	     ? edge->caller->inlined_to->count.ipa ()
+ 	     : edge->caller->count.ipa ())))
+      || (lookup_attribute ("hot", DECL_ATTRIBUTES (edge->caller->decl))
+	  != NULL
+	 && lookup_attribute ("hot", DECL_ATTRIBUTES (edge->callee->decl))
+	  != NULL))
+     hints |= INLINE_HINT_known_hot;
+ 
+   gcc_checking_assert (size >= 0);
+diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+new file mode 100644
+index 000000000..1f3be641c
+--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/inlinehint-6.c
+@@ -0,0 +1,47 @@
+/* { dg-options "-O3 -c -fdump-ipa-inline-details -fno-early-inlining -fno-ipa-cp"  } */
+/* { dg-add-options bind_pic_locally } */
+
+#define size_t long long int
+
+struct A
+{
+  size_t f1, f2, f3, f4;
+};
+struct C
+{
+  struct A a;
+  size_t b;
+};
+struct C x;
+
+__attribute__((hot)) struct C callee (struct A *a, struct C *c)
+{
+  c->a=(*a);
+
+  if((c->b + 7) & 17)
+   {
+      c->a.f1 = c->a.f2 + c->a.f1;
+      c->a.f2 = c->a.f3 - c->a.f2;
+      c->a.f3 = c->a.f2 + c->a.f3;
+      c->a.f4 = c->a.f2 - c->a.f4;
+      c->b = c->a.f2;
+
+    }
+  return *c;
+}
+
+__attribute__((hot)) struct C caller (size_t d, size_t e, size_t f, size_t g, struct C *c)
+{
+  struct A a;
+  a.f1 = 1 + d;
+  a.f2 = e;
+  a.f3 = 12 + f;
+  a.f4 = 68 + g;
+  if (c->b > 0)
+    return callee (&a, c);
+  else
+    return *c;
+}
+
+/* { dg-final { scan-ipa-dump "known_hot"  "inline"  } } */
+
+-- 
+2.31.1
+
--- a/0004-Enable-small-loop-unrolling-for-O2.patch
+++ b/0004-Enable-small-loop-unrolling-for-O2.patch
@ -0,0 +1,490 @@
+From 1070bc24f53e851cae55320e26715cc594efcd2f Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Thu, 8 Sep 2022 16:52:02 +0800
+Subject: [PATCH] Enable small loop unrolling for O2
+
+Modern processors have multi-way instruction decoders.
+For x86, icelake/zen3 has 5 uops, so for small loop with <= 4
+instructions (usually has 3 uops with a cmp/jmp pair that can be
+macro-fused), the decoder would have 2 uops bubble for each iteration
+and the pipeline could not be fully utilized.
+
+Therefore, this patch enables loop unrolling for small size loops at O2
+to fill the decoder as much as possible. It turns on rtl loop
+unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
+In the x86 backend the default behavior is to unroll small loops with at
+most 4 insns by 1 time.
+
+This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
+0.9% codesize increment. For other benchmarks the variants are minor
+and overall codesize increased by 0.2%.
+
+The kernel image size increased by 0.06%, and no impact on eembc.
+
+gcc/ChangeLog:
+
+	* common/config/i386/i386-common.cc (ix86_optimization_table):
+	Enable small loop unroll at O2 by default.
+	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
+	factor if -munroll-only-small-loops enabled and -funroll-loops/
+	-funroll-all-loops are disabled.
+	* config/i386/i386.h (struct processor_costs): Add 2 field
+	small_unroll_ninsns and small_unroll_factor.
+	* config/i386/i386.opt: Add -munroll-only-small-loops.
+	* doc/invoke.texi: Document -munroll-only-small-loops.
+	* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
+	loop unrolling for -O2-speed and above if target hook
+	loop_unroll_adjust exists.
+	(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
+	when target hook loop_unroll_adjust exists.
+	* config/i386/x86-tune-costs.h: Update all processor costs
+	with small_unroll_ninsns = 4 and small_unroll_factor = 2.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.dg/guality/loop-1.c: Add additional option
+	-mno-unroll-only-small-loops.
+	* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
+	* gcc.target/i386/pr93002.c: Likewise.
+---
+ gcc/common/config/i386/i386-common.cc   |  1 +
+ gcc/config/i386/i386.cc                 | 18 ++++++++
+ gcc/config/i386/i386.h                  |  5 +++
+ gcc/config/i386/i386.opt                |  4 ++
+ gcc/config/i386/x86-tune-costs.h        | 58 +++++++++++++++++++++++++
+ gcc/doc/invoke.texi                     | 11 ++++-
+ gcc/loop-init.cc                        | 10 +++--
+ gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 +
+ gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
+ gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
+ 10 files changed, 107 insertions(+), 6 deletions(-)
+
+diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
+index e2594cae4..cdd5caa55 100644
+--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
+@@ -1687,6 +1687,7 @@ static const struct default_options ix86_option_optimization_table[] =
+     /* The STC algorithm produces the smallest code at -Os, for x86.  */
+     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
+       REORDER_BLOCKS_ALGORITHM_STC },
+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
+     /* Turn off -fschedule-insns by default.  It tends to make the
+        problem with not enough registers even worse.  */
+     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 9a9ff3b34..e56004300 100644
+--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
+@@ -23570,6 +23570,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
+   unsigned i;
+   unsigned mem_count = 0;
+ 
+  /* Unroll small size loop when unroll factor is not explicitly
+     specified.  */
+  if (!(flag_unroll_loops
+	|| flag_unroll_all_loops
+	|| loop->unroll))
+    {
+      nunroll = 1;
+
+      /* Any explicit -f{no-}unroll-{all-}loops turns off
+	 -munroll-only-small-loops.  */
+      if (ix86_unroll_only_small_loops
+	  && !OPTION_SET_P (flag_unroll_loops)
+	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+	nunroll = ix86_cost->small_unroll_factor;
+
+      return nunroll;
+    }
+
+   if (!TARGET_ADJUST_UNROLL)
+      return nunroll;
+ 
+diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
+index fce0b3564..688aaabd3 100644
+--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
+@@ -219,6 +219,11 @@ struct processor_costs {
+   const char *const align_jump;		/* Jump alignment.  */
+   const char *const align_label;	/* Label alignment.  */
+   const char *const align_func;		/* Function alignment.  */
+
+  const unsigned small_unroll_ninsns;	/* Insn count limit for small loop
+					   to be unrolled.  */
+  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
+					   be unrolled.  */
+ };
+ 
+ extern const struct processor_costs *ix86_cost;
+diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
+index a3675e515..fc1b944ac 100644
+--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
+@@ -1214,3 +1214,7 @@ Do not use GOT to access external symbols.
+ -param=x86-stlf-window-ninsns=
+ Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
+ Instructions number above which STFL stall penalty can be compensated.
+
+munroll-only-small-loops
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
+Enable conservative small loop unrolling.
+diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
+index f105d57ca..db4c2da34 100644
+--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
+@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* Processor costs (relative to an add) */
+@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
+   "4",					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   "4",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs i486_memcpy[2] = {
+@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs pentium_memcpy[2] = {
+@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static const
+@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
+@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs geode_memcpy[2] = {
+@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs k6_memcpy[2] = {
+@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
+   "32:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "32",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* For some reason, Athlon deals better with REP prefix (relative to loops)
+@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* K8 has optimized REP instruction for medium sized blocks, but for very
+@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
+@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
+   "32:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "32",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /*  BDVER has optimized REP instruction for medium sized blocks, but for
+@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ 
+@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
+@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ struct processor_costs znver3_cost = {
+@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* This table currently replicates znver3_cost table. */
+@@ -1952,6 +1982,8 @@ struct processor_costs znver4_cost = {
+   "16",					/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
+@@ -2076,6 +2108,8 @@ struct processor_costs skylake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* icelake_cost should produce code tuned for Icelake family of CPUs.
+@@ -2202,6 +2236,8 @@ struct processor_costs icelake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
+@@ -2322,6 +2358,8 @@ struct processor_costs alderlake_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
+@@ -2435,6 +2473,8 @@ const struct processor_costs btver1_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs btver2_memcpy[2] = {
+@@ -2545,6 +2585,8 @@ const struct processor_costs btver2_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "11",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs pentium4_memcpy[2] = {
+@@ -2654,6 +2696,8 @@ struct processor_costs pentium4_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs nocona_memcpy[2] = {
+@@ -2766,6 +2810,8 @@ struct processor_costs nocona_cost = {
+   NULL,					/* Jump alignment.  */
+   NULL,					/* Label alignment.  */
+   NULL,					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs atom_memcpy[2] = {
+@@ -2876,6 +2922,8 @@ struct processor_costs atom_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs slm_memcpy[2] = {
+@@ -2986,6 +3034,8 @@ struct processor_costs slm_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs tremont_memcpy[2] = {
+@@ -3110,6 +3160,8 @@ struct processor_costs tremont_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ static stringop_algs intel_memcpy[2] = {
+@@ -3220,6 +3272,8 @@ struct processor_costs intel_cost = {
+   "16:8:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* Generic should produce code tuned for Core-i7 (and newer chips)
+@@ -3339,6 +3393,8 @@ struct processor_costs generic_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+ /* core_cost should produce code tuned for Core familly of CPUs.  */
+@@ -3465,5 +3521,7 @@ struct processor_costs core_cost = {
+   "16:11:8",				/* Jump alignment.  */
+   "0:0:8",				/* Label alignment.  */
+   "16",					/* Func alignment.  */
+  4,					/* Small unroll limit.  */
+  2,					/* Small unroll factor.  */
+ };
+ 
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index ff8cd032f..16f4b367e 100644
+--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
+@@ -1449,7 +1449,8 @@ See RS/6000 and PowerPC Options.
+ -mgeneral-regs-only  -mcall-ms2sysv-xlogues -mrelax-cmpxchg-loop @gol
+ -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
+ -mindirect-branch-register -mharden-sls=@var{choice} @gol
+--mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access}
+-mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access @gol
+-munroll-only-small-loops}
+ 
+ @emph{x86 Windows Options}
+ @gccoptlist{-mconsole  -mcygwin  -mno-cygwin  -mdll @gol
+@@ -33183,6 +33184,14 @@ treat access to protected symbols as local symbols.  The default is
+ @option{-mno-direct-extern-access} and executable compiled with
+ @option{-mdirect-extern-access} may not be binary compatible if
+ protected symbols are used in shared libraries and executable.
+
+@item -munroll-only-small-loops
+@opindex munroll-only-small-loops
+@opindex mno-unroll-only-small-loops
+Controls conservative small loop unrolling.  It is enabled by default at
+O2, and unrolls loops with at most 4 insns by 1 time.  Explicit
+-f[no-]unroll-[all-]loops disables this flag to avoid any
+unintended unrolling behavior that the user does not want.
+ @end table
+ 
+ @node x86 Windows Options
+diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
+index 1e4f6cfd7..f1c717041 100644
+--- a/gcc/loop-init.cc
+++ b/gcc/loop-init.cc
+@@ -565,9 +565,12 @@ public:
+   {}
+ 
+   /* opt_pass methods: */
+-  virtual bool gate (function *)
+  virtual bool gate (function *fun)
+     {
+-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+	      || (targetm.loop_unroll_adjust
+		  && optimize >= 2
+		  && optimize_function_for_speed_p (fun)));
+     }
+ 
+   virtual unsigned int execute (function *);
+@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
+       if (dump_file)
+ 	df_dump (dump_file);
+ 
+-      if (flag_unroll_loops)
+      if (flag_unroll_loops
+	  || targetm.loop_unroll_adjust)
+ 	flags |= UAP_UNROLL;
+       if (flag_unroll_all_loops)
+ 	flags |= UAP_UNROLL_ALL;
+diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
+index 1b1f6d322..a32ea445a 100644
+--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
+@@ -1,5 +1,7 @@
+ /* { dg-do run } */
+ /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+
+ 
+ #include "../nop.h"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
+index 81841ef5b..cbc9fbb04 100644
+--- a/gcc/testsuite/gcc.target/i386/pr86270.c
+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+ 
+ int *a;
+ long len;
+diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
+index 0248fcc00..f75a847f7 100644
+--- a/gcc/testsuite/gcc.target/i386/pr93002.c
+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
+@@ -1,6 +1,6 @@
+ /* PR target/93002 */
+ /* { dg-do compile } */
+-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+ /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
+ 
+ volatile int sink;
+-- 
+2.31.1
+
--- a/0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
+++ b/0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch
@ -0,0 +1,230 @@
+From 96898a9cd8c159625848247bd2f3a09e5c12fcfa Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Sat, 19 Nov 2022 09:38:00 +0800
+Subject: [PATCH] i386: Only enable small loop unrolling in backend [PR
+ 107692]
+
+Following the discussion in PR107692, -munroll-only-small-loops
+does not turn -funroll-loops on or off, and the current check in
+pass_rtl_unroll_loops::gate would cause -fno-unroll-loops to not take
+effect. Revert the change about targetm.loop_unroll_adjust and apply
+the backend option change to strictly follow the rule that
+-funroll-loops takes full control of loop unrolling, and
+-munroll-only-small-loops just changes its behavior to unroll small size
+loops.
+
+gcc/ChangeLog:
+
+	PR target/107692
+	* common/config/i386/i386-common.cc (ix86_optimization_table):
+	Enable loop unroll O2, disable -fweb and -frename-registers
+	by default.
+	* config/i386/i386-options.cc
+	(ix86_override_options_after_change):
+	Disable small loop unroll when funroll-loops enabled, reset
+	cunroll_grow_size when it is not explicitly enabled.
+	(ix86_option_override_internal): Call
+	ix86_override_options_after_change instead of calling
+	ix86_recompute_optlev_based_flags and ix86_default_align
+	separately.
+	* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
+	factor if -munroll-only-small-loops enabled.
+	* loop-init.cc (pass_rtl_unroll_loops::gate): Do not enable
+	loop unrolling for -O2-speed.
+	(pass_rtl_unroll_loops::execute): Remove
+	targetm.loop_unroll_adjust check.
+
+gcc/testsuite/ChangeLog:
+
+	PR target/107692
+	* gcc.dg/guality/loop-1.c: Remove additional option for ia32.
+	* gcc.target/i386/pr86270.c: Add -fno-unroll-loops.
+	* gcc.target/i386/pr93002.c: Likewise.
+---
+ gcc/common/config/i386/i386-common.cc   |  8 ++++++
+ gcc/config/i386/i386-options.cc         | 34 ++++++++++++++++++++++---
+ gcc/config/i386/i386.cc                 | 18 ++++---------
+ gcc/loop-init.cc                        | 10 +++-----
+ gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 --
+ gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
+ gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
+ 7 files changed, 48 insertions(+), 28 deletions(-)
+
+diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
+index cdd5caa55..f650e255f 100644
+--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
+@@ -1687,7 +1687,15 @@ static const struct default_options ix86_option_optimization_table[] =
+     /* The STC algorithm produces the smallest code at -Os, for x86.  */
+     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
+       REORDER_BLOCKS_ALGORITHM_STC },
+
+    /* Turn on -funroll-loops with -munroll-only-small-loops to enable small
+       loop unrolling at -O2.  */
+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_funroll_loops, NULL, 1 },
+     { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
+    /* Turns off -frename-registers and -fweb which are enabled by
+       funroll-loops.  */
+    { OPT_LEVELS_ALL, OPT_frename_registers, NULL, 0 },
+    { OPT_LEVELS_ALL, OPT_fweb, NULL, 0 },
+     /* Turn off -fschedule-insns by default.  It tends to make the
+        problem with not enough registers even worse.  */
+     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index 099cec4b6..ff44ad4e0 100644
+--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
+@@ -1816,8 +1816,37 @@ ix86_recompute_optlev_based_flags (struct gcc_options *opts,
+ void
+ ix86_override_options_after_change (void)
+ {
+  /* Default align_* from the processor table.  */
+   ix86_default_align (&global_options);
+
+   ix86_recompute_optlev_based_flags (&global_options, &global_options_set);
+
+  /* Disable unrolling small loops when there's explicit
+     -f{,no}unroll-loop.  */
+  if ((OPTION_SET_P (flag_unroll_loops))
+     || (OPTION_SET_P (flag_unroll_all_loops)
+	 && flag_unroll_all_loops))
+    {
+      if (!OPTION_SET_P (ix86_unroll_only_small_loops))
+	ix86_unroll_only_small_loops = 0;
+      /* Re-enable -frename-registers and -fweb if funroll-loops
+	 enabled.  */
+      if (!OPTION_SET_P (flag_web))
+	flag_web = flag_unroll_loops;
+      if (!OPTION_SET_P (flag_rename_registers))
+	flag_rename_registers = flag_unroll_loops;
+      /* -fcunroll-grow-size default follws -f[no]-unroll-loops.  */
+      if (!OPTION_SET_P (flag_cunroll_grow_size))
+	flag_cunroll_grow_size = flag_unroll_loops
+				 || flag_peel_loops
+				 || optimize >= 3;
+    }
+  else
+    {
+      if (!OPTION_SET_P (flag_cunroll_grow_size))
+	flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
+    }
+
+ }
+ 
+ /* Clear stack slot assignments remembered from previous functions.
+@@ -2329,7 +2358,7 @@ ix86_option_override_internal (bool main_args_p,
+ 
+   set_ix86_tune_features (opts, ix86_tune, opts->x_ix86_dump_tunes);
+ 
+-  ix86_recompute_optlev_based_flags (opts, opts_set);
+  ix86_override_options_after_change ();
+ 
+   ix86_tune_cost = processor_cost_table[ix86_tune];
+   /* TODO: ix86_cost should be chosen at instruction or function granuality
+@@ -2360,9 +2389,6 @@ ix86_option_override_internal (bool main_args_p,
+       || TARGET_64BIT_P (opts->x_ix86_isa_flags))
+     opts->x_ix86_regparm = REGPARM_MAX;
+ 
+-  /* Default align_* from the processor table.  */
+-  ix86_default_align (opts);
+-
+   /* Provide default for -mbranch-cost= value.  */
+   SET_OPTION_IF_UNSET (opts, opts_set, ix86_branch_cost,
+ 		       ix86_tune_cost->branch_cost);
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index e56004300..462dce10e 100644
+--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
+@@ -23572,20 +23572,12 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
+ 
+   /* Unroll small size loop when unroll factor is not explicitly
+      specified.  */
+-  if (!(flag_unroll_loops
+-	|| flag_unroll_all_loops
+-	|| loop->unroll))
+  if (ix86_unroll_only_small_loops && !loop->unroll)
+     {
+-      nunroll = 1;
+-
+-      /* Any explicit -f{no-}unroll-{all-}loops turns off
+-	 -munroll-only-small-loops.  */
+-      if (ix86_unroll_only_small_loops
+-	  && !OPTION_SET_P (flag_unroll_loops)
+-	  && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+-	nunroll = ix86_cost->small_unroll_factor;
+-
+-      return nunroll;
+      if (loop->ninsns <= ix86_cost->small_unroll_ninsns)
+	return MIN (nunroll, ix86_cost->small_unroll_factor);
+      else
+	return 1;
+     }
+ 
+   if (!TARGET_ADJUST_UNROLL)
+diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
+index f1c717041..1e4f6cfd7 100644
+--- a/gcc/loop-init.cc
+++ b/gcc/loop-init.cc
+@@ -565,12 +565,9 @@ public:
+   {}
+ 
+   /* opt_pass methods: */
+-  virtual bool gate (function *fun)
+  virtual bool gate (function *)
+     {
+-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+-	      || (targetm.loop_unroll_adjust
+-		  && optimize >= 2
+-		  && optimize_function_for_speed_p (fun)));
+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
+     }
+ 
+   virtual unsigned int execute (function *);
+@@ -586,8 +583,7 @@ pass_rtl_unroll_loops::execute (function *fun)
+       if (dump_file)
+ 	df_dump (dump_file);
+ 
+-      if (flag_unroll_loops
+-	  || targetm.loop_unroll_adjust)
+      if (flag_unroll_loops)
+ 	flags |= UAP_UNROLL;
+       if (flag_unroll_all_loops)
+ 	flags |= UAP_UNROLL_ALL;
+diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
+index a32ea445a..1b1f6d322 100644
+--- a/gcc/testsuite/gcc.dg/guality/loop-1.c
+++ b/gcc/testsuite/gcc.dg/guality/loop-1.c
+@@ -1,7 +1,5 @@
+ /* { dg-do run } */
+ /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+-/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+-
+ 
+ #include "../nop.h"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
+index cbc9fbb04..98b012caf 100644
+--- a/gcc/testsuite/gcc.target/i386/pr86270.c
+++ b/gcc/testsuite/gcc.target/i386/pr86270.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+/* { dg-options "-O2 -fno-unroll-loops" } */
+ 
+ int *a;
+ long len;
+diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
+index f75a847f7..7e2d869e1 100644
+--- a/gcc/testsuite/gcc.target/i386/pr93002.c
+++ b/gcc/testsuite/gcc.target/i386/pr93002.c
+@@ -1,6 +1,6 @@
+ /* PR target/93002 */
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
+/* { dg-options "-O2 -fno-unroll-loops" } */
+ /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
+ 
+ volatile int sink;
+-- 
+2.31.1
+
--- a/gcc.spec
+++ b/gcc.spec
@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 4
+%global gcc_release 7

 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@ -139,6 +139,9 @@ Provides: gcc(major) = %{gcc_major}
 Patch0: 0000-Version-Set-version-to-12.3.1.patch
 Patch1: 0001-CONFIG-Regenerate-configure-file.patch
 Patch2: 0002-libquadmath-Enable-libquadmath-on-kunpeng.patch
+Patch3: 0003-Add-attribute-hot-judgement-for-INLINE_HINT_known_ho.patch
+Patch4: 0004-Enable-small-loop-unrolling-for-O2.patch
+Patch5: 0005-i386-Only-enable-small-loop-unrolling-in-backend-PR-.patch

 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@ -609,6 +612,9 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch0 -p1
 %patch1 -p1
 %patch2 -p1
+%patch3 -p1
+%patch4 -p1
+%patch5 -p1

 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE

@ -2712,6 +2718,18 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*

 %changelog
+* Fri Aug 11 2023 Hongyu Wang <hongyu.wang@intel.com> 12.3.1-7
+- Type:Sync
+- i386: Only enable small loop unrolling in backend [PR 107692].
+
+* Fri Aug 11 2023 Hongyu Wang <hongyu.wang@intel.com> 12.3.1-6
+- Type:Sync
+- Enable small loop unrolling for O2.
+
+* Fri Aug 11 2023 Cui,Lili <lili.cui@intel.com> 12.3.1-5
+- Type:Sync
+- Add attribute hot judgement for INLINE_HINT_known_hot hint.
+
 * Mon Jul 17 2023 huangxiaoquan <huangxiaoquan1@huawei.com> 12.3.1-4
 - Type:SPEC
 - DESC:Enable libquadmath on kunpeng