[Sync] Sync patch from openeuler/gcc
This commit is contained in:
parent 5db544f251
commit 3ee8545fc2

0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch (new file, 194 lines)
@@ -0,0 +1,194 @@
From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 11:27:43 +0300
Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation

---
 gcc/config/aarch64/aarch64-simd.md  | 48 +++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.cc       | 15 +++++++++
 gcc/config/aarch64/aarch64.opt      |  4 +++
 gcc/config/aarch64/iterators.md     |  3 +-
 gcc/config/aarch64/predicates.md    | 25 +++++++++++++++
 gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
 6 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ee7f0b89c..82f73805f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6454,6 +6454,54 @@
   [(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
 )
 
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+	(minus:<V_INT_EQUIV>
+	  (ashift:<V_INT_EQUIV>
+	    (and:<V_INT_EQUIV>
+	      (lshiftrt:<V_INT_EQUIV>
+		(match_operand:VDQHSD 1 "register_operand" "w")
+		(match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+	      (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+	    (match_operand:VDQHSD 4 "half_size_operand"))
+	  (and:<V_INT_EQUIV>
+	    (lshiftrt:<V_INT_EQUIV>
+	      (match_dup 1)
+	      (match_dup 2))
+	    (match_dup 3))))]
+  "TARGET_SIMD && flag_cmlt_arith"
+  "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+  [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*arch64_cmlt_tmp<mode>"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+	(and:<V_INT_EQUIV>
+	  (lshiftrt:<V_INT_EQUIV>
+	    (match_operand:VDQHSD 1 "register_operand" "w")
+	    (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+  "TARGET_SIMD && flag_cmlt_arith"
+  "#"
+  "&& reload_completed"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+	(lshiftrt:<V_INT_EQUIV>
+	  (match_operand:VDQHSD 1 "register_operand")
+	  (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+   (set (match_dup 0)
+	(and:<V_INT_EQUIV>
+	  (match_dup 0)
+	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+  ""
+  [(set_attr "type" "neon_compare_zero")]
+)
+
 (define_insn_and_split "aarch64_cm<optab>di"
   [(set (match_operand:DI 0 "register_operand" "=w,w,r")
 	(neg:DI
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a3da4ca30..04072ca25 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14064,6 +14064,21 @@ cost_minus:
 	return true;
       }
 
+    /* Detect aarch64_cmlt_as_arith instruction. Now only this pattern
+       matches the condition. The costs of cmlt and sub instructions
+       are comparable, so we are not increasing the cost here. */
+    if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+	&& GET_CODE (op1) == AND)
+      {
+	rtx op0_subop0 = XEXP (op0, 0);
+	if (rtx_equal_p (op0_subop0, op1))
+	  {
+	    rtx lshrt_op = XEXP (op0_subop0, 0);
+	    if (GET_CODE (lshrt_op) == LSHIFTRT)
+	      return true;
+	  }
+      }
+
     /* Look for SUB (extended register). */
     if (is_a <scalar_int_mode> (mode)
 	&& aarch64_rtx_arith_op_extract_p (op1))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a64b927e9..101664c7c 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
 This option is for use with fstack-protector-strong and not for use in
 user-land code.
 
+mcmlt-arith
+Target Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
 TargetVariable
 long aarch64_stack_protector_guard_offset = 0
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 26a840d7f..967e6b0b1 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1485,7 +1485,8 @@
 			   (V2DI "2s")])
 
 ;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+			   (V8HI "16b") (V4SI "8h")
 			   (V2DI "4s")])
 
 ;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac..07c14aacb 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -49,6 +49,31 @@
   return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
 })
 
+(define_predicate "half_size_minus_one_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+  return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
 (define_predicate "subreg_lowpart_operator"
   (ior (match_code "truncate")
        (and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo (). It's inspired by sources of x264 codec. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+  for (unsigned i = 0; i < 4; i++)
+  {
+    uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+		  &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+    b[i] = (a[i]+s)^s;
+  }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0

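For readers unfamiliar with the trick in patch 0035: the insn pattern above matches an arithmetic idiom that replicates the sign bit of each packed half-element across that half, which is exactly what cmlt <reg>, <reg>, #0 produces (all-ones lanes where the element is negative). The following self-checking C sketch of that equivalence is our own illustration, not part of the synced patch; the function names are hypothetical.

/* Illustrative only: why the arithmetic form matched by
   *aarch64_cmlt_as_arith equals a per-element "compare signed
   less than zero" on 16-bit halves packed in a 32-bit word.  */
#include <assert.h>
#include <stdint.h>

/* The arithmetic shape the insn pattern matches (SImode example). */
static uint32_t cmlt_as_arith (uint32_t a)
{
  uint32_t t = (a >> 15) & 0x00010001u;  /* sign bit of each 16-bit half */
  return (t << 16) - t;                  /* 0xFFFF for each negative half */
}

/* Reference: element-wise cmlt #0 on the two 16-bit halves. */
static uint32_t cmlt_ref (uint32_t a)
{
  uint32_t lo = (int16_t) (a & 0xFFFF) < 0 ? 0x0000FFFFu : 0;
  uint32_t hi = (int16_t) (a >> 16) < 0 ? 0xFFFF0000u : 0;
  return hi | lo;
}

int main (void)
{
  uint32_t tests[] = { 0, 0x8000u, 0x80000000u, 0x80008000u, 0x7FFF7FFFu };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
    assert (cmlt_as_arith (tests[i]) == cmlt_ref (tests[i]));
  return 0;
}
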
0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch (new file, 560 lines)
@@ -0,0 +1,560 @@
From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 28 Feb 2024 23:05:12 +0800
Subject: [PATCH 02/18] [rtl-ifcvt] introduce rtl ifcvt enchancements new
 option: -fifcvt-allow-complicated-cmps: allows ifcvt to deal
 with complicated cmps like

 cmp reg1 (reg2 + reg3)

 can increase compilation time
 new param:
 -param=ifcvt-allow-register-renaming=[0,1,2]
 1 : allows ifcvt to rename registers in then and else bb
 2 : allows to rename registers in condition and else/then bb
 can increase compilation time and register pressure
---
 gcc/common.opt                               |   4 +
 gcc/ifcvt.cc                                 | 291 +++++++++++++++---
 gcc/params.opt                               |   4 +
 .../gcc.c-torture/execute/ifcvt-renaming-1.c |  35 +++
 gcc/testsuite/gcc.dg/ifcvt-6.c               |  27 ++
 5 files changed, 311 insertions(+), 50 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
 create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c

diff --git a/gcc/common.opt b/gcc/common.opt
index c7c6bc256..aa00fb7b0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3691,4 +3691,8 @@ fipa-ra
 Common Var(flag_ipa_ra) Optimization
 Use caller save register across calls if possible.
 
+fifcvt-allow-complicated-cmps
+Common Var(flag_ifcvt_allow_complicated_cmps) Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 2c1eba312..584db7b55 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
     }
 
   /* Don't even try if the comparison operands or the mode of X are weird. */
-  if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+  if (!flag_ifcvt_allow_complicated_cmps
+      && (cond_complex
+	  || !SCALAR_INT_MODE_P (GET_MODE (x))))
     return NULL_RTX;
 
   return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
   /* Currently support only simple single sets in test_bb. */
   if (!sset
      || !noce_operand_ok (SET_DEST (sset))
-     || contains_ccmode_rtx_p (SET_DEST (sset))
+     || (!flag_ifcvt_allow_complicated_cmps
+	 && contains_ccmode_rtx_p (SET_DEST (sset)))
      || !noce_operand_ok (SET_SRC (sset)))
     return false;
 
@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
    in this function. */
 
 static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+			basic_block bb_b,
+			rtx to_rename,
+			bitmap conflict_regs)
 {
   rtx_insn *a_insn;
   bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+  bitmap intersections = BITMAP_ALLOC (&reg_obstack);
   df_ref def;
   df_ref use;
+  rtx_insn *last_a = last_active_insn (bb_a, FALSE);
 
   FOR_BB_INSNS (bb_a, a_insn)
     {
@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
       rtx sset_a = single_set (a_insn);
 
       if (!sset_a)
-	{
-	  BITMAP_FREE (bba_sets);
-	  return false;
-	}
+	goto end_cmove_arith_check_and_fail;
       /* Record all registers that BB_A sets. */
       FOR_EACH_INSN_DEF (def, a_insn)
-	if (!(to_rename && DF_REF_REG (def) == to_rename))
+	if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a))
 	  bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
     }
 
+  bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
   rtx_insn *b_insn;
-
   FOR_BB_INSNS (bb_b, b_insn)
     {
       if (!active_insn_p (b_insn))
@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
       rtx sset_b = single_set (b_insn);
 
       if (!sset_b)
-	{
-	  BITMAP_FREE (bba_sets);
-	  return false;
-	}
+	goto end_cmove_arith_check_and_fail;
 
       /* Make sure this is a REG and not some instance
	 of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
       if (MEM_P (SET_DEST (sset_b)))
 	gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
       else if (!REG_P (SET_DEST (sset_b)))
-	{
-	  BITMAP_FREE (bba_sets);
-	  return false;
-	}
+	goto end_cmove_arith_check_and_fail;
 
-      /* If the insn uses a reg set in BB_A return false. */
+      /* If the insn uses a reg set in BB_A return false
	 or try to collect register list for renaming. */
       FOR_EACH_INSN_USE (use, b_insn)
 	{
-	  if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+	  if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
 	    {
-	      BITMAP_FREE (bba_sets);
-	      return false;
+	      if (param_ifcvt_allow_register_renaming < 1)
+		goto end_cmove_arith_check_and_fail;
+
+	      /* Those regs should be renamed. We can't rename CC reg, but
		 possibly we can provide combined comparison in the future. */
+	      if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+		goto end_cmove_arith_check_and_fail;
+	      bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
 	    }
 	}
-
     }
 
   BITMAP_FREE (bba_sets);
+  BITMAP_FREE (intersections);
   return true;
+
+end_cmove_arith_check_and_fail:
+  BITMAP_FREE (bba_sets);
+  BITMAP_FREE (intersections);
+  return false;
 }
 
 /* Emit copies of all the active instructions in BB except the last.
@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
   return true;
 }
 
+/* This function tries to rename regs that intersect with considered bb
+   inside condition expression. Condition expression will be moved down
+   if the optimization will be applied, so it is essential to be sure that
+   all intersected registers will be renamed otherwise transformation
+   can't be applied. Function returns true if renaming was successful
+   and optimization can proceed futher. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+  bool success = true;
+  if (bitmap_empty_p (cond_rename_regs))
+    return true;
+  if (param_ifcvt_allow_register_renaming < 2)
+    return false;
+  df_ref use;
+  rtx_insn *cmp_insn = if_info->cond_earliest;
+  /* Jump instruction as a condion currently unsupported. */
+  if (JUMP_P (cmp_insn))
+    return false;
+  rtx_insn *before_cmp = PREV_INSN (cmp_insn);
+  start_sequence ();
+  rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+  basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+  FOR_EACH_INSN_USE (use, cmp_insn)
+    {
+      if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+	{
+	  rtx use_reg = DF_REF_REG (use);
+	  rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+	  if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+	    {
+	      end_sequence ();
+	      return false;
+	    }
+	  noce_emit_move_insn (tmp, use_reg);
+	}
+    }
+
+  emit_insn (PATTERN (copy_of_cmp));
+  rtx_insn *seq = get_insns ();
+  unshare_all_rtl_in_chain (seq);
+  end_sequence ();
+
+  emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+  delete_insn_and_edges (cmp_insn);
+  rtx_insn *insn;
+  FOR_BB_INSNS (cmp_block, insn)
+    df_insn_rescan (insn);
+
+  if_info->cond = noce_get_condition (if_info->jump,
+				      &copy_of_cmp,
+				      if_info->then_else_reversed);
+  if_info->cond_earliest = copy_of_cmp;
+  if_info->rev_cond = NULL_RTX;
+
+  return success;
+}
+
+/* This function tries to rename regs that intersect with considered bb.
+   return true if the renaming was successful and optimization can
+   proceed futher, false otherwise. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+  if (bitmap_empty_p (rename_regs))
+    return true;
+  rtx_insn *insn;
+  rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+  bool res = true;
+  start_sequence ();
+  FOR_BB_INSNS (test_bb, insn)
+    {
+      if (!active_insn_p (insn))
+	continue;
+      /* Only ssets are supported for now. */
+      rtx sset = single_set (insn);
+      gcc_assert (sset);
+      rtx x = SET_DEST (sset);
+      if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x)))
+	continue;
+      /* Do not need to rename dest in the last instruction
	 it will be renamed anyway. */
+      if (insn == last_insn)
+	continue;
+      machine_mode mode = GET_MODE (x);
+      rtx tmp = gen_reg_rtx (mode);
+      if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+	{
+	  gcc_assert (insn != last_insn);
+	  /* We can generate additional move for such case,
	     but it will increase register preasure.
	     For now just stop transformation. */
+	  rtx result_rtx = SET_DEST (single_set (last_insn));
+	  if (REG_P (result_rtx) && (x != result_rtx))
+	    {
+	      res = false;
+	      break;
+	    }
+	  if (!validate_replace_rtx (x, tmp, insn))
+	    gcc_unreachable ();
+	  noce_emit_move_insn (tmp,x);
+	}
+      set_used_flags (insn);
+      rtx_insn *rename_candidate;
+      for (rename_candidate = NEXT_INSN (insn);
	   rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
	   rename_candidate = NEXT_INSN (rename_candidate))
+	{
+	  if (!reg_overlap_mentioned_p (x, rename_candidate))
+	    continue;
+
+	  int replace_res = TRUE;
+	  if (rename_candidate == last_insn)
+	    {
+	      validate_replace_src_group (x, tmp, rename_candidate);
+	      replace_res = apply_change_group ();
+	    }
+	  else
+	    replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+	  gcc_assert (replace_res);
+	  set_used_flags (rename_candidate);
+	}
+      set_used_flags (x);
+      set_used_flags (tmp);
+    }
+  rtx_insn *seq = get_insns ();
+  unshare_all_rtl_in_chain (seq);
+  end_sequence ();
+  emit_insn_before_setloc (seq, first_active_insn (test_bb),
			   INSN_LOCATION (first_active_insn (test_bb)));
+  FOR_BB_INSNS (test_bb, insn)
+    df_insn_rescan (insn);
+  return res;
+}
+
 /* Try more complex cases involving conditional_move. */
 
 static int
@@ -2185,11 +2331,30 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
 	  std::swap (then_bb, else_bb);
 	}
     }
-
+  bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+  bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
   if (then_bb && else_bb
-      && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
-	  || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
-    return FALSE;
+      && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
				   if_info->orig_x,
				   then_bb_rename_regs)
+	  || !bbs_ok_for_cmove_arith (else_bb, then_bb,
				      if_info->orig_x,
				      else_bb_rename_regs)))
+    {
+      BITMAP_FREE (then_bb_rename_regs);
+      BITMAP_FREE (else_bb_rename_regs);
+      return FALSE;
+    }
+  bool prepass_renaming = noce_rename_regs_in_bb (then_bb,
						  then_bb_rename_regs)
			  && noce_rename_regs_in_bb (else_bb,
						     else_bb_rename_regs);
+
+  BITMAP_FREE (then_bb_rename_regs);
+  BITMAP_FREE (else_bb_rename_regs);
+
+  if (!prepass_renaming)
+    return FALSE;
 
   start_sequence ();
 
@@ -3072,7 +3237,8 @@ noce_operand_ok (const_rtx op)
 
 static bool
 bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
-			     unsigned int *cost, bool *simple_p)
+			     unsigned int *cost, bool *simple_p,
+			     bitmap cond_rename_regs)
 {
   if (!test_bb)
     return false;
@@ -3112,8 +3278,9 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
   rtx_insn *prev_last_insn = PREV_INSN (last_insn);
   gcc_assert (prev_last_insn);
 
-  /* For now, disallow setting x multiple times in test_bb. */
-  if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+  if (REG_P (x)
+      && reg_set_between_p (x, first_insn, prev_last_insn)
+      && param_ifcvt_allow_register_renaming < 1)
     return false;
 
   bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
@@ -3125,25 +3292,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
   rtx_insn *insn;
   FOR_BB_INSNS (test_bb, insn)
     {
-      if (insn != last_insn)
-	{
-	  if (!active_insn_p (insn))
-	    continue;
+      if (insn == last_insn)
+	continue;
+      if (!active_insn_p (insn))
+	continue;
 
-	  if (!insn_valid_noce_process_p (insn, cc))
-	    goto free_bitmap_and_fail;
+      if (!insn_valid_noce_process_p (insn, cc))
+	goto free_bitmap_and_fail;
 
-	  rtx sset = single_set (insn);
-	  gcc_assert (sset);
+      rtx sset = single_set (insn);
+      gcc_assert (sset);
 
-	  if (contains_mem_rtx_p (SET_SRC (sset))
-	      || !REG_P (SET_DEST (sset))
-	      || reg_overlap_mentioned_p (SET_DEST (sset), cond))
-	    goto free_bitmap_and_fail;
+      if (contains_mem_rtx_p (SET_SRC (sset))
	  || !REG_P (SET_DEST (sset)))
+	goto free_bitmap_and_fail;
 
-	  potential_cost += pattern_cost (sset, speed_p);
-	  bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+      if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+	{
+	  if (param_ifcvt_allow_register_renaming < 1)
+	    goto free_bitmap_and_fail;
+	  rtx sset_dest = SET_DEST (sset);
+	  if (REG_P (sset_dest)
	      && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+	    bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+	  else
+	    goto free_bitmap_and_fail;
 	}
+      potential_cost += pattern_cost (sset, speed_p);
+      if (SET_DEST (sset) != SET_DEST (last_set))
+	bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
     }
 
   /* If any of the intermediate results in test_bb are live after test_bb
@@ -3777,15 +3954,29 @@ noce_process_if_block (struct noce_if_info *if_info)
 
   bool speed_p = optimize_bb_for_speed_p (test_bb);
   unsigned int then_cost = 0, else_cost = 0;
+  bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
   if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
-				    &if_info->then_simple))
-    return false;
+				    &if_info->then_simple, cond_rename_regs))
+    {
+      BITMAP_FREE (cond_rename_regs);
+      return false;
+    }
 
   if (else_bb
       && !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
-				       &if_info->else_simple))
-    return false;
+				       &if_info->else_simple, cond_rename_regs))
+    {
+      BITMAP_FREE (cond_rename_regs);
+      return false;
+    }
 
+  if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
+    {
+      BITMAP_FREE (cond_rename_regs);
+      return false;
+    }
+  BITMAP_FREE (cond_rename_regs);
+  cond = if_info->cond;
   if (speed_p)
     if_info->original_cost += average_cost (then_cost, else_cost,
					    find_edge (test_bb, then_bb));
@@ -5823,12 +6014,13 @@ if_convert (bool after_combine)
 {
   basic_block bb;
   int pass;
-
   if (optimize == 1)
     {
       df_live_add_problem ();
       df_live_set_all_dirty ();
     }
+  free_dominance_info (CDI_DOMINATORS);
+  cleanup_cfg (CLEANUP_EXPENSIVE);
 
   /* Record whether we are after combine pass. */
   ifcvt_after_combine = after_combine;
@@ -5933,7 +6125,6 @@ rest_of_handle_if_conversion (void)
       dump_reg_info (dump_file);
       dump_flow_info (dump_file, dump_flags);
     }
-  cleanup_cfg (CLEANUP_EXPENSIVE);
   if_convert (false);
   if (num_updated_if_blocks)
     /* Get rid of any dead CC-related instructions. */
diff --git a/gcc/params.opt b/gcc/params.opt
index d2196dc68..ba87f820b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -669,6 +669,10 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
 Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
 Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
 
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
 -param=max-sched-extend-regions-iters=
 Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
 The maximum number of iterations through CFG to extend regions.
diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
new file mode 100644
index 000000000..65c4d4140
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
@@ -0,0 +1,35 @@
+
+extern void abort(void);
+
+__attribute__ ((noinline))
+int foo (int x, int y, int z, int a, int b)
+{
+  if (a < 2) {
+    if (a == 0) {
+      if (x - y < 0)
+	x = x - y + z;
+      else
+	x = x - y;
+    }
+    else {
+      if (x + y >= z)
+	x = x + y - z;
+      else
+	x = x + y;
+    }
+  }
+  return x;
+}
+
+int main(void) {
+  if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2
+    abort ();
+  if (foo (50,10,7,0,1) != 40) // x - y = 40
+    abort ();
+  if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8
+    abort ();
+  if (foo (5,10,70,1,1) != 15) // x + y = 15
+    abort ();
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c
new file mode 100644
index 000000000..be9a67b3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-6.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 -fifcvt-allow-complicated-cmps" } */
+
+typedef unsigned int uint16_t;
+
+uint16_t
+foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a,
+     uint16_t b, uint16_t c, uint16_t d) {
+  int i = 1;
+  int j = 1;
+  if (a > b) {
+    j = x;
+    if (b > c)
+      i = y;
+    else
+      i = z;
+  }
+  else {
+    j = y;
+    if (c > d)
+      i = z;
+  }
+  return i * j;
+}
+
+/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */
+
--
2.33.0

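To see the shape of source code patch 0036 targets, consider a branch guarded by a comparison of a register against a sum, the cmp reg1 (reg2 + reg3) case from the commit message. The sketch below is our own illustration, not part of the synced patch; only the flag and parameter names come from the patch itself.

/* Illustrative only: a branch whose condition is a "reg vs. reg+reg"
   comparison, the kind -fifcvt-allow-complicated-cmps lets the RTL
   if-conversion pass consider for a conditional-move sequence.  */
int pick (int a, int b, int c)
{
  int t = a;
  if (a == b + c)   /* complicated cmp: register against a sum */
    t = b - c;
  return t;
}

/* A plausible invocation exercising both new knobs (assumed usage):
   gcc -O2 -fifcvt-allow-complicated-cmps \
       --param=ifcvt-allow-register-renaming=2 pick.c  */
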
0037-Perform-early-if-conversion-of-simple-arithmetic.patch (new file, 109 lines)
@@ -0,0 +1,109 @@
From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 12 Jan 2023 14:52:49 +0300
Subject: [PATCH 03/18] Perform early if-conversion of simple arithmetic

---
 gcc/common.opt                      |  4 ++++
 gcc/match.pd                        | 25 +++++++++++++++++++
 gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c

diff --git a/gcc/common.opt b/gcc/common.opt
index aa00fb7b0..dac477c04 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1821,6 +1821,10 @@ fif-conversion2
 Common Var(flag_if_conversion2) Optimization
 Perform conversion of conditional jumps to conditional execution.
 
+fif-conversion-gimple
+Common Var(flag_if_conversion_gimple) Optimization
+Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
 fstack-reuse=
 Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
 -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 6f24d5079..3cbaf2a5b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   )
  )
 )
+
+(if (flag_if_conversion_gimple)
+ (for simple_op (plus minus bit_and bit_ior bit_xor)
+  (simplify
+   (cond @0 (simple_op @1 INTEGER_CST@2) @1)
+   (switch
+    /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
+    (if (integer_onep (@2))
+     (simple_op @1 (convert (convert:boolean_type_node @0))))
+    /* a = cond ? a + powerof2cst : a ->
+       a = a + ((int) cond) << log2 (powerof2cst) */
+    (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
+     (with
+      {
+	tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
+      }
+      (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
+		     { shift; })
+      )
+     )
+    )
+   )
+  )
+ )
+)
 #endif
 
 #if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
new file mode 100644
index 000000000..0f7c87e5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
+
+int test_int (int optimizable_int) {
+  if (optimizable_int > 5)
+    ++optimizable_int;
+  return optimizable_int;
+}
+
+int test_int_pow2 (int optimizable_int_pow2) {
+  if (optimizable_int_pow2 <= 4)
+    optimizable_int_pow2 += 1024;
+  return optimizable_int_pow2;
+}
+
+int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
+  if (not_optimizable_int_non_pow2 == 1)
+    not_optimizable_int_non_pow2 += 513;
+  return not_optimizable_int_non_pow2;
+}
+
+float test_float (float not_optimizable_float) {
+  if (not_optimizable_float > 5)
+    not_optimizable_float += 1;
+  return not_optimizable_float;
+}
+
+/* Expecting if-else block in test_float and test_int_non_pow2 only. */
+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
+
+/* Expecting shifted result only for optimizable_int_pow2. */
+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
--
2.33.0

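The match.pd rule in patch 0037 rests on a small branchless identity: for a power-of-two constant C, "cond ? a + C : a" equals "a + ((int) cond << log2 (C))", and for C == 1 the shift disappears entirely. A self-checking C sketch of the identity follows; it is our own illustration, not part of the synced patch.

/* Illustrative only: the branchless rewrite the new rule performs,
   shown at the source level for C == 1024 (log2 == 10).  */
#include <assert.h>

static int branchy (int a, int cond)
{
  return cond ? a + 1024 : a;
}

static int branchless (int a, int cond)
{
  /* (cond != 0) normalizes the condition to 0 or 1, as the
     convert:boolean_type_node step does at the gimple level.  */
  return a + ((cond != 0) << 10);
}

int main (void)
{
  for (int a = -3; a <= 3; a++)
    {
      assert (branchy (a, 0) == branchless (a, 0));
      assert (branchy (a, 7) == branchless (a, 7));
    }
  return 0;
}
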
0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch (new file, 252 lines)
@@ -0,0 +1,252 @@
From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 24 Jan 2023 16:43:40 +0300
Subject: [PATCH 04/18] Add option to allow matching uaddsub overflow for widen
 ops too.

---
 gcc/common.opt                 |   5 ++
 gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
 gcc/tree-ssa-math-opts.cc      |  43 ++++++++--
 3 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c

diff --git a/gcc/common.opt b/gcc/common.opt
index dac477c04..39c90604e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3106,6 +3106,11 @@ freciprocal-math
 Common Var(flag_reciprocal_math) SetByCombined Optimization
 Same as -fassociative-math for expressions which include division.
 
+fuaddsub-overflow-match-all
+Common Var(flag_uaddsub_overflow_match_all)
+Match unsigned add/sub overflow even if the target does not support
+the corresponding instruction.
+
 ; Nonzero means that unsafe floating-point math optimizations are allowed
 ; for the sake of speed. IEEE compliance is not guaranteed, and operations
 ; are allowed to assume that their arguments and results are "normal"
diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
new file mode 100644
index 000000000..96c26d308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/uaddsub.c
@@ -0,0 +1,143 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+  uint128_t lo;
+  uint128_t hi;
+} uint256_t;
+
+uint16_t add16 (uint8_t a, uint8_t b)
+{
+  uint8_t tmp = a + b;
+  uint8_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint16_t res = overflow;
+  res <<= 8;
+  res += tmp;
+  return res;
+}
+
+uint32_t add32 (uint16_t a, uint16_t b)
+{
+  uint16_t tmp = a + b;
+  uint16_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint32_t res = overflow;
+  res <<= 16;
+  res += tmp;
+  return res;
+}
+
+uint64_t add64 (uint32_t a, uint32_t b)
+{
+  uint32_t tmp = a + b;
+  uint32_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint64_t res = overflow;
+  res <<= 32;
+  res += tmp;
+  return res;
+}
+
+uint128_t add128 (uint64_t a, uint64_t b)
+{
+  uint64_t tmp = a + b;
+  uint64_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint128_t res = overflow;
+  res <<= 64;
+  res += tmp;
+  return res;
+}
+
+uint256_t add256 (uint128_t a, uint128_t b)
+{
+  uint128_t tmp = a + b;
+  uint128_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint256_t res;
+  res.hi = overflow;
+  res.lo = tmp;
+  return res;
+}
+
+uint16_t sub16 (uint8_t a, uint8_t b)
+{
+  uint8_t tmp = a - b;
+  uint8_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint16_t res = overflow;
+  res <<= 8;
+  res += tmp;
+  return res;
+}
+
+uint32_t sub32 (uint16_t a, uint16_t b)
+{
+  uint16_t tmp = a - b;
+  uint16_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint32_t res = overflow;
+  res <<= 16;
+  res += tmp;
+  return res;
+}
+
+uint64_t sub64 (uint32_t a, uint32_t b)
+{
+  uint32_t tmp = a - b;
+  uint32_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint64_t res = overflow;
+  res <<= 32;
+  res += tmp;
+  return res;
+}
+
+uint128_t sub128 (uint64_t a, uint64_t b)
+{
+  uint64_t tmp = a - b;
+  uint64_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint128_t res = overflow;
+  res <<= 64;
+  res += tmp;
+  return res;
+}
+
+uint256_t sub256 (uint128_t a, uint128_t b)
+{
+  uint128_t tmp = a - b;
+  uint128_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint256_t res;
+  res.hi = overflow;
+  res.lo = tmp;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 232e903b0..55d6ee8ae 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
     }
 }
 
+/* Check if the corresponding operation has wider equivalent on the target. */
+
+static bool
+wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
+{
+  machine_mode wider_mode;
+  FOR_EACH_WIDER_MODE (wider_mode, mode)
+    {
+      machine_mode next_mode;
+      if (optab_handler (op, wider_mode) != CODE_FOR_nothing
+	  || (op == smul_optab
+	      && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
+	      && (find_widening_optab_handler ((unsignedp
						? umul_widen_optab
						: smul_widen_optab),
					       next_mode, mode))))
+	return true;
+    }
+
+  return false;
+}
 
 /* Helper function of match_arith_overflow. For MUL_OVERFLOW, if we have
    a check for non-zero like:
@@ -3903,15 +3924,22 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
	      || code == MINUS_EXPR
	      || code == MULT_EXPR
	      || code == BIT_NOT_EXPR);
+  int unsignedp = TYPE_UNSIGNED (type);
   if (!INTEGRAL_TYPE_P (type)
-      || !TYPE_UNSIGNED (type)
-      || has_zero_uses (lhs)
-      || (code != PLUS_EXPR
-	  && code != MULT_EXPR
-	  && optab_handler (code == MINUS_EXPR ? usubv4_optab : uaddv4_optab,
-			    TYPE_MODE (type)) == CODE_FOR_nothing))
+      || !unsignedp
+      || has_zero_uses (lhs))
     return false;
 
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
+    {
+      machine_mode mode = TYPE_MODE (type);
+      optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
+      if (optab_handler (op, mode) == CODE_FOR_nothing
+	  && (!flag_uaddsub_overflow_match_all
+	      || !wider_optab_check_p (op, mode, unsignedp)))
+	return false;
+    }
+
   tree rhs1 = gimple_assign_rhs1 (stmt);
   tree rhs2 = gimple_assign_rhs2 (stmt);
   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
@@ -3986,7 +4014,8 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
       || (code != MULT_EXPR && (code == BIT_NOT_EXPR ? use_seen : !use_seen))
       || (code == PLUS_EXPR
	   && optab_handler (uaddv4_optab,
			     TYPE_MODE (type)) == CODE_FOR_nothing
+	   && !flag_uaddsub_overflow_match_all)
       || (code == MULT_EXPR
	   && optab_handler (cast_stmt ? mulv4_optab : umulv4_optab,
			     TYPE_MODE (type)) == CODE_FOR_nothing))
--
2.33.0

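The idiom patch 0038 recognizes is the classic carry computation "tmp = a + b; overflow = tmp < a". At the gimple level the pass rewrites it to the .ADD_OVERFLOW internal function, whose source-level counterpart is __builtin_add_overflow. A self-checking C sketch of the equivalence follows; it is our own illustration, not part of the synced patch.

/* Illustrative only: the recognized overflow idiom next to the builtin
   form the pass effectively produces.  */
#include <assert.h>
#include <stdint.h>

static uint32_t add32_idiom (uint16_t a, uint16_t b)
{
  uint16_t tmp = a + b;
  uint16_t overflow = tmp < a;       /* carry out of the 16-bit add */
  return ((uint32_t) overflow << 16) + tmp;
}

static uint32_t add32_builtin (uint16_t a, uint16_t b)
{
  uint16_t sum;
  uint16_t carry = __builtin_add_overflow (a, b, &sum);
  return ((uint32_t) carry << 16) + sum;
}

int main (void)
{
  assert (add32_idiom (0xFFFF, 1) == add32_builtin (0xFFFF, 1)); /* 0x10000 */
  assert (add32_idiom (12, 34) == add32_builtin (12, 34));
  return 0;
}
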
0039-Match-double-sized-mul-pattern.patch (new file, 488 lines)
@@ -0,0 +1,488 @@
|
||||
From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001
|
||||
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
|
||||
Date: Wed, 25 Jan 2023 15:04:07 +0300
|
||||
Subject: [PATCH 05/18] Match double sized mul pattern
|
||||
|
||||
---
|
||||
gcc/match.pd | 136 +++++++++++++++++++++
|
||||
gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
|
||||
gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++
|
||||
gcc/tree-ssa-math-opts.cc | 80 ++++++++++++
|
||||
4 files changed, 419 insertions(+)
|
||||
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
|
||||
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c
|
||||
|
||||
diff --git a/gcc/match.pd b/gcc/match.pd
|
||||
index 3cbaf2a5b..61866cb90 100644
|
||||
--- a/gcc/match.pd
|
||||
+++ b/gcc/match.pd
|
||||
@@ -7895,3 +7895,139 @@ and,
|
||||
== TYPE_UNSIGNED (TREE_TYPE (@3))))
|
||||
&& single_use (@4)
|
||||
&& single_use (@5))))
|
||||
+
|
||||
+/* Match multiplication with double sized result.
|
||||
+
|
||||
+ Consider the following calculations:
|
||||
+ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
|
||||
+ * (2^(bit_size/2) * arg1_hi + arg1_lo)
|
||||
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
|
||||
+ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
|
||||
+ + arg0_lo * arg1_lo
|
||||
+
|
||||
+ The products of high and low parts fits in bit_size values, thus they are
|
||||
+ placed in high and low parts of result respectively.
|
||||
+
|
||||
+ The sum of the mixed products may overflow, so we need a detection for that.
|
||||
+ Also it has a bit_size/2 offset, thus it intersects with both high and low
|
||||
+ parts of result. Overflow detection constant is bit_size/2 due to this.
|
||||
+
|
||||
+ With this info:
|
||||
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
|
||||
+ + 2^(bit_size/2) * middle
|
||||
+ + 2^bit_size * possible_middle_overflow
|
||||
+ + arg0_lo * arg1_lo
|
||||
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
|
||||
+ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
|
||||
+ + arg0_lo * arg1_lo
|
||||
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
|
||||
+ + possible_middle_overflow)
|
||||
+ + 2^(bit_size/2) * middle_lo
|
||||
+ + arg0_lo * arg1_lo
|
||||
+
|
||||
+ The last sum can produce overflow for the high result part. With this:
|
||||
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
|
||||
+ + possible_res_lo_overflow + middle_hi)
|
||||
+ + res_lo
|
||||
+ = res_hi + res_lo
|
||||
+
|
||||
+ This formula is quite big to fit into one match pattern with all of the
|
||||
+ combinations of terms inside it. There are many helpers for better code
|
||||
+ readability.
|
||||
+
|
||||
+ The simplification basis is res_hi: assuming that res_lo only is not
|
||||
+ real practical case for such calculations.
|
||||
+
|
||||
+ Overflow handling is done via matching complex calculations:
|
||||
+ the realpart and imagpart are quite handy here. */
|
||||
+/* Match low and high parts of the argument. */
|
||||
+(match (double_size_mul_arg_lo @0 @1)
|
||||
+ (bit_and @0 INTEGER_CST@1)
|
||||
+ (if (wi::to_wide (@1)
|
||||
+ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
|
||||
+(match (double_size_mul_arg_hi @0 @1)
|
||||
+ (rshift @0 INTEGER_CST@1)
|
||||
+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
|
||||
+
|
||||
+/* Match various argument parts products. */
|
||||
+(match (double_size_mul_lolo @0 @1)
|
||||
+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
|
||||
+ (if (single_use (@4))))
|
||||
+(match (double_size_mul_hihi @0 @1)
|
||||
+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
|
||||
+ (if (single_use (@4))))
|
||||
+(match (double_size_mul_lohi @0 @1)
|
||||
+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
|
||||
+ (if (single_use (@4))))
|
||||
+
|
||||
+/* Match complex middle sum. */
|
||||
+(match (double_size_mul_middle_complex @0 @1)
|
||||
+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
|
||||
+ (if (num_imm_uses (@2) == 2)))
|
||||
+
|
||||
+/* Match real middle results. */
|
||||
+(match (double_size_mul_middle @0 @1)
|
||||
+ (realpart@2 (double_size_mul_middle_complex @0 @1))
|
||||
+ (if (num_imm_uses (@2) == 2)))
|
||||
+(match (double_size_mul_middleres_lo @0 @1)
|
||||
+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
|
||||
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
|
||||
+ && single_use (@3))))
|
||||
+(match (double_size_mul_middleres_hi @0 @1)
|
||||
+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
|
||||
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
|
||||
+ && single_use (@3))))
|
||||
+
|
||||
+/* Match low result part. */
|
||||
+/* Number of uses may be < 2 in case when we are interested in
|
||||
+ high part only. */
|
||||
+(match (double_size_mul_res_lo_complex @0 @1)
|
||||
+ (IFN_ADD_OVERFLOW:c@2
|
||||
+ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
|
||||
+ (if (num_imm_uses (@2) <= 2)))
|
||||
+(match (double_size_mul_res_lo @0 @1)
|
||||
+ (realpart (double_size_mul_res_lo_complex @0 @1)))
|
||||
+
|
||||
+/* Match overflow terms. */
|
||||
+(match (double_size_mul_overflow_check_lo @0 @1 @5)
|
||||
+ (convert@4 (ne@3
|
||||
+ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
|
||||
+ (if (single_use (@2) && single_use (@3) && single_use (@4))))
|
||||
+(match (double_size_mul_overflow_check_hi @0 @1)
|
||||
+ (lshift@6 (convert@5 (ne@4
|
||||
+ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
|
||||
+ INTEGER_CST@2)
|
||||
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
|
||||
+ && single_use (@3) && single_use (@4) && single_use (@5)
|
||||
+ && single_use (@6))))
|
||||
+
|
||||
+/* Match all possible permutations for high result part calculations. */
|
||||
+(for op1 (double_size_mul_hihi
|
||||
+ double_size_mul_overflow_check_hi
|
||||
+ double_size_mul_middleres_hi)
|
||||
+ op2 (double_size_mul_overflow_check_hi
|
||||
+ double_size_mul_middleres_hi
|
||||
+ double_size_mul_hihi)
|
||||
+ op3 (double_size_mul_middleres_hi
|
||||
+ double_size_mul_hihi
|
||||
+ double_size_mul_overflow_check_hi)
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2
|
||||
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
|
||||
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
|
||||
+ (if (single_use (@4) && single_use (@5))))
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
|
||||
+ (plus:c@4 (op1:c @0 @1)
|
||||
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
|
||||
+ (if (single_use (@4) && single_use (@5))))
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2 (op1:c @0 @1)
|
||||
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
|
||||
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
|
||||
+ (if (single_use (@4) && single_use (@5))))
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2 (op1:c @0 @1)
|
||||
+ (plus:c@4 (op2:c @0 @1)
|
||||
+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
|
||||
+ (if (single_use (@4) && single_use (@5)))))
|
||||
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
|
||||
new file mode 100644
|
||||
index 000000000..4d475cc8a
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
|
||||
@@ -0,0 +1,141 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
|
||||
+ proper overflow detection in some cases. */
|
||||
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+typedef unsigned __int128 uint128_t;
|
||||
+
|
||||
+uint16_t mul16 (uint8_t a, uint8_t b)
|
||||
+{
|
||||
+ uint8_t a_lo = a & 0xF;
|
||||
+ uint8_t b_lo = b & 0xF;
|
||||
+ uint8_t a_hi = a >> 4;
|
||||
+ uint8_t b_hi = b >> 4;
|
||||
+ uint8_t lolo = a_lo * b_lo;
|
||||
+ uint8_t lohi = a_lo * b_hi;
|
||||
+ uint8_t hilo = a_hi * b_lo;
|
||||
+ uint8_t hihi = a_hi * b_hi;
|
||||
+ uint8_t middle = hilo + lohi;
|
||||
+ uint8_t middle_hi = middle >> 4;
|
||||
+ uint8_t middle_lo = middle << 4;
|
||||
+ uint8_t res_lo = lolo + middle_lo;
|
||||
+ uint8_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x10 : 0);
|
||||
+ uint16_t res = ((uint16_t) res_hi) << 8;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint32_t mul32 (uint16_t a, uint16_t b)
|
||||
+{
|
||||
+ uint16_t a_lo = a & 0xFF;
|
||||
+ uint16_t b_lo = b & 0xFF;
|
||||
+ uint16_t a_hi = a >> 8;
|
||||
+ uint16_t b_hi = b >> 8;
|
||||
+ uint16_t lolo = a_lo * b_lo;
|
||||
+ uint16_t lohi = a_lo * b_hi;
|
||||
+ uint16_t hilo = a_hi * b_lo;
|
||||
+ uint16_t hihi = a_hi * b_hi;
|
||||
+ uint16_t middle = hilo + lohi;
|
||||
+ uint16_t middle_hi = middle >> 8;
|
||||
+ uint16_t middle_lo = middle << 8;
|
||||
+ uint16_t res_lo = lolo + middle_lo;
|
||||
+ uint16_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x100 : 0);
|
||||
+ uint32_t res = ((uint32_t) res_hi) << 16;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint64_t mul64 (uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ uint32_t a_lo = a & 0xFFFF;
|
||||
+ uint32_t b_lo = b & 0xFFFF;
|
||||
+ uint32_t a_hi = a >> 16;
|
||||
+ uint32_t b_hi = b >> 16;
|
||||
+ uint32_t lolo = a_lo * b_lo;
|
||||
+ uint32_t lohi = a_lo * b_hi;
|
||||
+ uint32_t hilo = a_hi * b_lo;
|
||||
+ uint32_t hihi = a_hi * b_hi;
|
||||
+ uint32_t middle = hilo + lohi;
|
||||
+ uint32_t middle_hi = middle >> 16;
|
||||
+ uint32_t middle_lo = middle << 16;
|
||||
+ uint32_t res_lo = lolo + middle_lo;
|
||||
+ uint32_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x10000 : 0);
|
||||
+ uint64_t res = ((uint64_t) res_hi) << 32;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint128_t mul128 (uint64_t a, uint64_t b)
|
||||
+{
|
||||
+ uint64_t a_lo = a & 0xFFFFFFFF;
|
||||
+ uint64_t b_lo = b & 0xFFFFFFFF;
|
||||
+ uint64_t a_hi = a >> 32;
|
||||
+ uint64_t b_hi = b >> 32;
|
||||
+ uint64_t lolo = a_lo * b_lo;
|
||||
+ uint64_t lohi = a_lo * b_hi;
|
||||
+ uint64_t hilo = a_hi * b_lo;
|
||||
+ uint64_t hihi = a_hi * b_hi;
|
||||
+ uint64_t middle = hilo + lohi;
|
||||
+ uint64_t middle_hi = middle >> 32;
|
||||
+ uint64_t middle_lo = middle << 32;
|
||||
+ uint64_t res_lo = lolo + middle_lo;
|
||||
+ uint64_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x100000000 : 0);
|
||||
+ uint128_t res = ((uint128_t) res_hi) << 64;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint64_t mul64_perm (uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ uint32_t a_lo = a & 0xFFFF;
|
||||
+ uint32_t b_lo = b & 0xFFFF;
|
||||
+ uint32_t a_hi = a >> 16;
|
||||
+ uint32_t b_hi = b >> 16;
|
||||
+ uint32_t lolo = a_lo * b_lo;
|
||||
+ uint32_t lohi = a_lo * b_hi;
|
||||
+ uint32_t hilo = a_hi * b_lo;
|
||||
+ uint32_t hihi = a_hi * b_hi;
|
||||
+ uint32_t middle = hilo + lohi;
|
||||
+ uint32_t middle_hi = middle >> 16;
|
||||
+ uint32_t middle_lo = middle << 16;
|
||||
+ uint32_t res_lo = lolo + middle_lo;
|
||||
+ uint32_t res_hi = hihi + middle_hi;
|
||||
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
|
||||
+ res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
|
||||
+ uint64_t res = ((uint64_t) res_hi) << 32;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint128_t mul128_perm (uint64_t a, uint64_t b)
|
||||
+{
|
||||
+ uint64_t a_lo = a & 0xFFFFFFFF;
|
||||
+ uint64_t b_lo = b & 0xFFFFFFFF;
|
||||
+ uint64_t a_hi = a >> 32;
|
||||
+ uint64_t b_hi = b >> 32;
|
||||
+ uint64_t lolo = a_lo * b_lo;
|
||||
+ uint64_t lohi = a_lo * b_hi;
|
||||
+ uint64_t hilo = a_hi * b_lo;
|
||||
+ uint64_t hihi = a_hi * b_hi;
|
||||
+ uint64_t middle = hilo + lohi;
|
||||
+ uint64_t middle_hi = middle >> 32;
|
||||
+ uint64_t middle_lo = middle << 32;
|
||||
+ uint64_t res_lo = lolo + middle_lo;
|
||||
+ uint64_t res_hi = hihi + middle_hi;
|
||||
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
|
||||
+ res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
|
||||
+ uint128_t res = ((uint128_t) res_hi) << 64;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
|
||||
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
|
||||
new file mode 100644
|
||||
index 000000000..cc6e5af25
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
|
||||
@@ -0,0 +1,62 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* fif-conversion-gimple is required for proper overflow detection
|
||||
+ in some cases. */
|
||||
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+typedef unsigned __int128 uint128_t;
|
||||
+typedef struct uint256_t
|
||||
+{
|
||||
+ uint128_t lo;
|
||||
+ uint128_t hi;
|
||||
+} uint256_t;
|
||||
+
|
||||
+uint64_t mul64_double_use (uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ uint32_t a_lo = a & 0xFFFF;
|
||||
+ uint32_t b_lo = b & 0xFFFF;
|
||||
+ uint32_t a_hi = a >> 16;
|
||||
+ uint32_t b_hi = b >> 16;
|
||||
+ uint32_t lolo = a_lo * b_lo;
|
||||
+ uint32_t lohi = a_lo * b_hi;
|
||||
+ uint32_t hilo = a_hi * b_lo;
|
||||
+ uint32_t hihi = a_hi * b_hi;
|
||||
+ uint32_t middle = hilo + lohi;
|
||||
+ uint32_t middle_hi = middle >> 16;
|
||||
+ uint32_t middle_lo = middle << 16;
|
||||
+ uint32_t res_lo = lolo + middle_lo;
|
||||
+ uint32_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x10000 : 0);
|
||||
+ uint64_t res = ((uint64_t) res_hi) << 32;
|
||||
+ res += res_lo;
|
||||
+ return res + lolo;
|
||||
+}
|
||||
+
|
||||
+uint256_t mul256 (uint128_t a, uint128_t b)
|
||||
+{
|
||||
+ uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
|
||||
+ uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
|
||||
+ uint128_t a_hi = a >> 64;
|
||||
+ uint128_t b_hi = b >> 64;
|
||||
+ uint128_t lolo = a_lo * b_lo;
|
||||
+ uint128_t lohi = a_lo * b_hi;
|
||||
+ uint128_t hilo = a_hi * b_lo;
|
||||
+ uint128_t hihi = a_hi * b_hi;
|
||||
+ uint128_t middle = hilo + lohi;
|
||||
+ uint128_t middle_hi = middle >> 64;
|
||||
+ uint128_t middle_lo = middle << 64;
|
||||
+ uint128_t res_lo = lolo + middle_lo;
|
||||
+ uint128_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+  /* Workaround for the "constant is too big" warning.  */
+  uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
+  overflow_tmp <<= 64;
+  res_hi += overflow_tmp;
+  uint256_t res;
+  res.lo = res_lo;
+  res.hi = res_hi;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
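Note on the test above: both functions are deliberate negative cases. mul64_double_use keeps an extra use of the partial product lolo (the final "+ lolo"), and mul256 would need a 256-bit multiply, which presumably has no optab on the target, so the matcher must not fire and the dump is checked for the absence of the statistic. For contrast, here is a sketch of the accepted shape (illustrative C, not part of the patch; the name mul64_widened is mine): when the partial products feed only the recombination, the whole body is equivalent to one widening multiply.

#include <stdint.h>

/* What a clean 32x32->64 open-coded multiply collapses to once the
   pass replaces the four 16x16 partial products.  */
uint64_t mul64_widened (uint32_t a, uint32_t b)
{
  return (uint64_t) a * (uint64_t) b;
}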
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 55d6ee8ae..2c06b8a60 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -210,6 +210,9 @@ static struct

   /* Number of highpart multiplication ops inserted.  */
   int highpart_mults_inserted;
+
+  /* Number of optimized double sized multiplications.  */
+  int double_sized_mul_optimized;
 } widen_mul_stats;

 /* The instance of "struct occurrence" representing the highest
@@ -4893,6 +4896,78 @@ optimize_spaceship (gimple *stmt)
 }


+/* Pattern matcher for double sized multiplication defined in match.pd.  */
+extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
+
+static bool
+convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+  gimple *use_stmt, *complex_res_lo;
+  gimple_stmt_iterator insert_before;
+  imm_use_iterator use_iter;
+  tree match[4]; // arg0, arg1, res_hi, complex_res_lo
+  tree arg0, arg1, widen_mult, new_type, tmp;
+  tree lhs = gimple_assign_lhs (stmt);
+  location_t loc = UNKNOWN_LOCATION;
+  machine_mode mode;
+
+  if (!gimple_double_size_mul_candidate (lhs, match, NULL))
+    return false;
+
+  new_type = build_nonstandard_integer_type (
+    TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
+  mode = TYPE_MODE (new_type);
+
+  /* Early return if the required multiplication is not supported
+     on the target.  */
+  if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
+      && !wider_optab_check_p (smul_optab, mode, 1))
+    return false;
+
+  /* Determine the point where the wide multiplication
+     should be inserted.  Complex low res is OK since it is required
+     by both high and low part getters, thus it dominates both of them.  */
+  complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
+  insert_before = gsi_for_stmt (complex_res_lo);
+  gsi_next (&insert_before);
+
+  /* Create the widening multiplication.  */
+  arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
+  arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
+  widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
+                                       MULT_EXPR, arg0, arg1);
+
+  /* Find the mult low part getter.  */
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
+    if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
+      break;
+
+  /* Create high and low (if needed) parts extractors.  */
+  /* Low part.  */
+  if (use_stmt)
+    {
+      loc = gimple_location (use_stmt);
+      tmp = build_and_insert_cast (&insert_before, loc,
+                                   TREE_TYPE (gimple_get_lhs (use_stmt)),
+                                   widen_mult);
+      gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
+                                               NOP_EXPR, tmp);
+      gsi_replace (&insert_before, new_stmt, true);
+    }
+
+  /* High part.  */
+  loc = gimple_location (stmt);
+  tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
+                                RSHIFT_EXPR, widen_mult,
+                                build_int_cst (new_type,
+                                               TYPE_PRECISION (new_type) / 2));
+  tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
+  gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
+  gsi_replace (gsi, new_stmt, true);
+
+  widen_mul_stats.double_sized_mul_optimized++;
+  return true;
+}
+
 /* Find integer multiplications where the operands are extended from
    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
    or MULT_HIGHPART_EXPR where appropriate.  */
@@ -4987,6 +5062,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
 	  break;

 	case PLUS_EXPR:
+	  if (convert_double_size_mul (&gsi, stmt))
+	    break;
+	  __attribute__ ((fallthrough));
 	case MINUS_EXPR:
 	  if (!convert_plusminus_to_widen (&gsi, stmt, code))
 	    match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
@@ -5091,6 +5169,8 @@ pass_optimize_widening_mul::execute (function *fun)
 			    widen_mul_stats.divmod_calls_inserted);
   statistics_counter_event (fun, "highpart multiplications inserted",
 			    widen_mul_stats.highpart_mults_inserted);
+  statistics_counter_event (fun, "double sized mul optimized",
+			    widen_mul_stats.double_sized_mul_optimized);

   return cfg_changed ? TODO_cleanup_cfg : 0;
 }
--
2.33.0

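To make the inserted GIMPLE concrete, here is an illustrative C equivalent (mine; the helper name double_size_mul_expanded is hypothetical) of the three statements convert_double_size_mul builds: a widening multiply in the doubled type, a cast for the low half, and a shift plus cast for the high half.

#include <stdint.h>

void double_size_mul_expanded (uint32_t a, uint32_t b,
                               uint32_t *res_lo, uint32_t *res_hi)
{
  uint64_t widen_mult = (uint64_t) a * (uint64_t) b;  /* MULT_EXPR in new_type */
  *res_lo = (uint32_t) widen_mult;                    /* low-part NOP_EXPR cast */
  *res_hi = (uint32_t) (widen_mult >> 32);            /* RSHIFT_EXPR by precision/2 */
}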
2387
0040-Port-icp-patch-to-GCC-12.patch
Normal file
File diff suppressed because it is too large
100
0041-Port-fixes-in-icp-to-GCC-12.patch
Normal file
@@ -0,0 +1,100 @@
From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Tue, 27 Feb 2024 07:43:57 +0800
Subject: [PATCH 07/18] Port fixes in icp to GCC 12

---
 gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
index 383839189..318535d06 100644
--- a/gcc/ipa-devirt.cc
+++ b/gcc/ipa-devirt.cc
@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map)
   if (!map->count (ftype_uid))
     return;
   type_set* s = (*map)[ftype_uid];
+  if (!s)
+    {
+      fprintf (dump_file, "%d (no set)", ftype_uid);
+      return;
+    }
   for (type_set::const_iterator it = s->begin (); it != s->end (); it++)
     fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it);
 }
@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2)
       if (register_ailas_type (type1, type2, ta_map))
	analyze_pointees (type1, type2);
     }
+  unsigned type1_uid = TYPE_UID (type1);
+  unsigned type2_uid = TYPE_UID (type2);
+  if (type_uid_map->count (type1_uid) == 0)
+    (*type_uid_map)[type1_uid] = type1;
+  if (type_uid_map->count (type2_uid) == 0)
+    (*type_uid_map)[type2_uid] = type2;
+
   /* If function and non-function type pointers alias,
      the function type is unsafe.  */
   if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2))
-    unsafe_types->insert (TYPE_UID (type1));
+    unsafe_types->insert (type1_uid);
   if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1))
-    unsafe_types->insert (TYPE_UID (type2));
+    unsafe_types->insert (type2_uid);

   /* Try to figure out with pointers to incomplete types.  */
   if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1)
 static void
 analyze_global_var (varpool_node *var)
 {
-  var->get_constructor();
   tree decl = var->decl;
-  if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl)
-      || integer_zerop (DECL_INITIAL (decl)))
+  if (decl || !DECL_INITIAL (decl))
+    return;
+  var->get_constructor ();
+  if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl))
+      || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK)
     return;

   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt)
     {
       rhs = TREE_OPERAND (rhs, 0);
       if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST
-	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL)
+	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL
+	  || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL
+	  || TREE_CODE (rhs) == RESULT_DECL)
	rhs_type = build_pointer_type (TREE_TYPE (rhs));
       else if (TREE_CODE (rhs) == COMPONENT_REF)
	{
@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt)
	  gcc_assert (POINTER_TYPE_P (rhs_type));
	}
       else
-	gcc_unreachable();
+	{
+	  fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
+		   get_tree_code_name (TREE_CODE (rhs)));
+	  print_gimple_stmt (dump_file, stmt, 0);
+	  gcc_unreachable ();
+	}
     }
   else
     rhs_type = TREE_TYPE (rhs);
@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases ()
       decl_set *d_set = it1->second;
       tree type = (*type_uid_map)[it1->first];
       type_set *set = (*fta_map)[it1->first];
+      if (!set)
+	continue;
       for (type_set::const_iterator it2 = set->begin ();
	   it2 != set->end (); it2++)
	{
--
2.33.0

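The common thread in these hunks is defensive handling of lookups that can yield null sets. A reduced C sketch (mine, not GCC code) of the guard added to print_type_set and merge_fs_map_for_ftype_aliases:

#include <stdio.h>

struct type_set { int n; int *ids; };

/* A key can be present in the map while its set pointer is still NULL,
   so the pointer must be checked before iterating.  */
static void print_ids (struct type_set *s, unsigned uid)
{
  if (!s)
    {
      printf ("%u (no set)\n", uid);
      return;
    }
  for (int i = 0; i < s->n; i++)
    printf (i == 0 ? "%d" : ", %d", s->ids[i]);
}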
1245
0042-Add-split-complex-instructions-pass.patch
Normal file
File diff suppressed because it is too large
1426
0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
Normal file
File diff suppressed because it is too large
378
0044-Port-maxmin-patch-to-GCC-12.patch
Normal file
@@ -0,0 +1,378 @@
From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:07:24 +0800
Subject: [PATCH 12/18] Port maxmin patch to GCC 12

---
 gcc/config/aarch64/aarch64-simd.md    | 256 ++++++++++++++++++++++++++
 gcc/config/aarch64/predicates.md      |  19 ++
 gcc/testsuite/gcc.dg/combine-maxmin.c |  46 +++++
 3 files changed, 321 insertions(+)
 create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 82f73805f..de92802f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1138,6 +1138,82 @@
   [(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
 )

+;; Simplify the extension with following truncation for shift+neg operation.
+
+(define_insn_and_split "*aarch64_sshr_neg_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (match_operand:V8HI 1 "register_operand")
+		    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (match_dup 1)
+		    (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
+	      (match_dup 2)))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(ashiftrt:V8HI
+	  (neg:V8HI
+	    (match_operand:V8HI 1 "register_operand" "w"))
+	  (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
+  {
+    /* Reduce the shift amount to smaller mode.  */
+    int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
+	      - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
+    operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+  }
+  [(set_attr "type" "multiple")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(match_operand:V4SI 1 "register_operand" "w"))
+	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(match_operand:V4SI 3 "register_operand" "w"))
+	      (match_dup 2)))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(set (match_operand:V4SI 1 "register_operand" "=w")
+	(ashiftrt:V4SI
+	  (neg:V4SI
+	    (match_dup 1))
+	  (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+   (set (match_operand:V4SI 3 "register_operand" "=w")
+	(ashiftrt:V4SI
+	  (neg:V4SI
+	    (match_dup 3))
+	  (match_dup 2)))
+   (set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (match_dup 1))
+	  (truncate:V4HI
+	    (match_dup 3))))]
+  ""
+  [(set_attr "type" "multiple")]
+)
+
 (define_insn "*aarch64_simd_sra<mode>"
   [(set (match_operand:VDQ_I 0 "register_operand" "=w")
	(plus:VDQ_I
@@ -1714,6 +1790,26 @@
 }
 )

+(define_insn "vec_pack_trunc_shifted_<mode>"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
+	(vec_concat:<VNARROWQ2>
+	  (truncate:<VNARROWQ>
+	    (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
+			  (match_operand:VQN 2 "half_size_operand" "w")))
+	  (truncate:<VNARROWQ>
+	    (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
+			  (match_operand:VQN 4 "half_size_operand" "w")))))]
+  "TARGET_SIMD"
+  {
+    if (BYTES_BIG_ENDIAN)
+      return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
+    else
+      return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
+  }
+  [(set_attr "type" "neon_permute<q>")
+   (set_attr "length" "4")]
+)
+
 (define_insn "aarch64_shrn<mode>_insn_le"
   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
	(vec_concat:<VNARROWQ2>
@@ -6652,6 +6748,166 @@
   [(set_attr "type" "neon_tst<q>")]
 )

+;; Simplify the extension with following truncation for cmtst-like operation.
+
+(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (plus:V4HI
+	    (truncate:V4HI
+	      (eq:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (and:V8HI
+		      (match_operand:V8HI 1 "register_operand")
+		      (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+		    (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+		(match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
+	    (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
+	  (plus:V4HI
+	    (truncate:V4HI
+	      (eq:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (and:V8HI
+		      (match_dup 1)
+		      (match_dup 2))
+		    (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
+		(match_dup 4)))
+	    (match_dup 5))))]
+  "TARGET_SIMD && !reload_completed"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 6 "register_operand" "=w")
+	(match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+   (set (match_operand:V8HI 0 "register_operand" "=w")
+	(plus:V8HI
+	  (eq:V8HI
+	    (and:V8HI
+	      (match_operand:V8HI 1 "register_operand" "w")
+	      (match_dup 6))
+	    (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
+	  (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
+  {
+    if (can_create_pseudo_p ())
+      {
+	int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
+	operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+	int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
+	operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);

+	operands[6] = gen_reg_rtx (V8HImode);
+      }
+    else
+      FAIL;
+  }
+  [(set_attr "type" "neon_tst_q")]
+)
+
+;; Three helper definitions that allow combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(neg:V4SI
+	  (eq:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(and:V8HI
+		  (match_operand:V8HI 1 "register_operand")
+		  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+		(match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  "TARGET_SIMD && !reload_completed"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 5 "register_operand" "=w")
+	(and:V8HI
+	  (match_operand:V8HI 1 "register_operand")
+	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+   (set (match_operand:V4SI 0 "register_operand" "=w")
+	(sign_extend:V4SI
+	  (vec_select:V4HI
+	    (match_dup 5)
+	    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+   (set (match_dup 0)
+	(neg:V4SI
+	  (eq:V4SI
+	    (match_dup 0)
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  {
+    if (can_create_pseudo_p ())
+      operands[5] = gen_reg_rtx (V8HImode);
+    else
+      FAIL;
+  }
+  [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(neg:V4SI
+	  (eq:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(and:V8HI
+		  (match_operand:V8HI 1 "register_operand")
+		  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+		(match_operand:V8HI 3 "vect_par_cnst_hi_half")))
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  "TARGET_SIMD && !reload_completed"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 5 "register_operand" "=w")
+	(and:V8HI
+	  (match_operand:V8HI 1 "register_operand")
+	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+   (set (match_operand:V4SI 0 "register_operand" "=w")
+	(sign_extend:V4SI
+	  (vec_select:V4HI
+	    (match_dup 5)
+	    (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
+   (set (match_dup 0)
+	(neg:V4SI
+	  (eq:V4SI
+	    (match_dup 0)
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  {
+    if (can_create_pseudo_p ())
+      operands[5] = gen_reg_rtx (V8HImode);
+    else
+      FAIL;
+  }
+  [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (not:V4SI
+	      (match_operand:V4SI 1 "register_operand" "w")))
+	  (truncate:V4HI
+	    (not:V4SI
+	      (match_operand:V4SI 2 "register_operand" "w")))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(set (match_operand:V4SI 1 "register_operand" "=w")
+	(not:V4SI
+	  (match_dup 1)))
+   (set (match_operand:V4SI 2 "register_operand" "=w")
+	(not:V4SI
+	  (match_dup 2)))
+   (set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (match_dup 1))
+	  (truncate:V4HI
+	    (match_dup 2))))]
+  ""
+  [(set_attr "type" "multiple")]
+)
+
 (define_insn_and_split "aarch64_cmtstdi"
   [(set (match_operand:DI 0 "register_operand" "=w,r")
	(neg:DI
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 07c14aacb..1b8496c07 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -118,6 +118,25 @@
	(match_test "aarch64_simd_valid_immediate (op, NULL,
						    AARCH64_CHECK_ORR)"))))

+(define_predicate "aarch64_bic_imm_for_maxmin"
+  (match_code "const_vector")
+{
+  if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
+    return false;
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
+  return CONST_INT_P (op)
+	 && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
+})
+
+(define_predicate "maxmin_arith_shift_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
+  return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
 (define_predicate "aarch64_reg_or_bic_imm"
   (ior (match_operand 0 "register_operand")
	(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
new file mode 100755
index 000000000..06bce7029
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -0,0 +1,46 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+
+/* The test checks usage of smax/smin insns for clip evaluation and
+ * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
+ * sources of x264 codec.  */
+
+typedef unsigned char uint8_t;
+typedef long int intptr_t;
+typedef signed short int int16_t;
+
+static __attribute__((always_inline)) inline uint8_t clip (int x )
+{
+  return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
+}
+
+void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+         intptr_t stride, int width, int height, int16_t *buf)
+{
+  const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+  for( int y = 0; y < height; y++ ) {
+    for( int x = -2; x < width+3; x++ ) {
+      int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+              + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+      dstv[x] = clip ( (v + 16) >> 5 );
+      buf[x+2] = v + pad;
+    }
+    for( int x = 0; x < width; x++ )
+      dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+                + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+                - 32*pad + 512) >> 10);
+    for( int x = 0; x < width; x++ )
+      dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+                + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+                + 16) >> 5);
+    dsth += stride;
+    dstv += stride;
+    dstc += stride;
+    src += stride;
+  }
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
--
2.33.0

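For readers unfamiliar with the x264 idiom: clip above is branch-free saturation to [0, 255], and the reason smax/smin can implement it is the identity sketched below (my C, not from the patch).

#include <stdint.h>

/* clip(x) == min(max(x, 0), 255) for the 8-bit case: x < 0 gives
   (-x)>>31 == 0; x > 255 gives (-x)>>31 == -1, masked to 255.  */
static inline uint8_t clip_minmax (int x)
{
  int t = x > 0 ? x : 0;     /* smax with zero */
  return t < 255 ? t : 255;  /* smin with the mask */
}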
239
0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
Normal file
@@ -0,0 +1,239 @@
From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 22 Feb 2024 17:13:27 +0800
Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12

---
 gcc/common.opt                          |   4 +
 gcc/match.pd                            | 104 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/combine-maxmin-1.c |  15 ++++
 gcc/testsuite/gcc.dg/combine-maxmin-2.c |  14 ++++
 gcc/testsuite/gcc.dg/combine-maxmin.c   |  19 +++--
 5 files changed, 151 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
 create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 6c6fabb31..3a5004271 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1846,6 +1846,10 @@ fif-conversion-gimple
 Common Var(flag_if_conversion_gimple) Optimization
 Perform conversion of conditional jumps to branchless equivalents during gimple transformations.

+fconvert-minmax
+Common Var(flag_convert_minmax) Optimization
+Convert saturating clipping to min max.
+
 fstack-reuse=
 Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
 -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 61866cb90..3a19e93b3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8031,3 +8031,107 @@ and,
 (plus:c@4 (op2:c @0 @1)
	   (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
 (if (single_use (@4) && single_use (@5)))))
+
+/* MinMax pattern matching helpers.  More info on the transformation below.  */
+
+/* Match (a & 0b11..100..0) pattern.  */
+(match (minmax_cmp_arg @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+
+/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
+   This statement is blocking for the transformation of unsigned integers.
+   Do type check here to avoid unnecessary duplications.  */
+(match (minmax_sat_arg @0)
+ (rshift (negate @0) INTEGER_CST@1)
+ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
+      && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
+
+/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
+   The matched pattern can be described as saturated clipping.
+
+   The pattern supports truncation via both casts and bit_and.
+   Also there are patterns for possible inverted conditions.  */
+(if (flag_convert_minmax)
+/* Truncation via casts.  Unfortunately convert? cannot be applied here
+   because convert and cond take different number of arguments.  */
+ (simplify
+  (convert
+   (cond
+    (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? (minmax_sat_arg @0))
+    (convert? @0)))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? (minmax_sat_arg @0))
+   (convert? @0))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+
+ (simplify
+  (convert
+   (cond
+    (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? @0)
+    (convert? (minmax_sat_arg @0))))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? @0)
+   (convert? (minmax_sat_arg @0)))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+
+ /* Truncation via bit_and with mask.  Same concerns on convert? here.  */
+ (simplify
+  (convert
+   (cond
+    (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+    (convert? @0)))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+   (convert? @0))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+
+ (simplify
+  (convert
+   (cond
+    (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? @0)
+    (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? @0)
+   (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; }))))))
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
new file mode 100644
index 000000000..859ff7df8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int32_t *restrict a, int32_t *restrict x)
+{
+  for (int i = 0; i < 4; i++)
+    a[i] = ((((-x[i]) >> 31) ^ x[i])
+           & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
+}
+
+/* { dg-final { scan-assembler-not {smax\t} } } */
+/* { dg-final { scan-assembler-not {smin\t} } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
new file mode 100644
index 000000000..63d4d85b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int8_t *restrict a, int32_t *restrict x)
+{
+  for (int i = 0; i < 8; i++)
+    a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index 06bce7029..a984fa560 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target aarch64-*-* } } */
-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+/* { dg-options "-O3 -fconvert-minmax" } */

 /* The test checks usage of smax/smin insns for clip evaluation and
  * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 {
   const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
   for( int y = 0; y < height; y++ ) {
+    /* This loop is not being vectorized now.  */
     for( int x = -2; x < width+3; x++ ) {
       int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
              + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
       dstv[x] = clip ( (v + 16) >> 5 );
       buf[x+2] = v + pad;
     }
+
+    /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN.  */
     for( int x = 0; x < width; x++ )
       dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
                + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
                - 32*pad + 512) >> 10);
+
+    /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN.  */
     for( int x = 0; x < width; x++ )
       dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
                + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
                + 16) >> 5);
+
     dsth += stride;
     dstv += stride;
     dstc += stride;
@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 }
 }

-/* { dg-final { scan-assembler-times {smax\t} 4 } } */
-/* { dg-final { scan-assembler-times {smin\t} 4 } } */
-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
+/* Max is performed on 0 from signed values, match smax exactly.  */
+/* { dg-final { scan-assembler-times {smax\t} 6 } } */
+/* Min is performed on signed val>0 and a mask, min sign doesn't matter.  */
+/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
+/* All of the vectorized patterns are expected to be matched.  */
+/* { dg-final { scan-assembler-not {cmtst\t} } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
--
2.33.0
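A scalar view (my illustration) of what the new match.pd rules do: the ternary clip idiom on the left of the transform comment is rewritten into the min/max form, which the vectorizer and the aarch64 backend can then lower to smin/smax.

#include <stdint.h>

/* Before: the saturated-clipping shape matched by minmax_cmp_arg
   and minmax_sat_arg (mask = 0xFF here).  */
int32_t clip_before (int32_t x)
{
  return (x & ~0xFF) ? ((-x) >> 31) & 0xFF : x;
}

/* After: the replacement the simplify rules emit.  */
int32_t clip_after (int32_t x)
{
  int32_t t = x > 0 ? x : 0;   /* max (x, 0) */
  return t < 0xFF ? t : 0xFF;  /* min (t, mask) */
}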
65
0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
Normal file
@@ -0,0 +1,65 @@
From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:20:17 +0800
Subject: [PATCH 14/18] Add new pattern to pass the maxmin tests

---
 gcc/match.pd                          | 24 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/combine-maxmin.c |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3a19e93b3..aee58e47b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8038,6 +8038,10 @@ and,
 (match (minmax_cmp_arg @0 @1)
  (bit_and @0 INTEGER_CST@1)
  (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+/* Match ((unsigned) a > 0b0..01..1) pattern.  */
+(match (minmax_cmp_arg1 @0 @1)
+ (gt @0 INTEGER_CST@1)
+ (if (wi::popcount (wi::to_widest (@1) + 1) == 1)))

 /* Match (inversed_sign_bit >> sign_bit_pos) pattern.
    This statement is blocking for the transformation of unsigned integers.
@@ -8095,6 +8099,26 @@ and,
    (convert (min (max @0 { integer_zero_node; })
		  { mask; })))))

+ (simplify
+  (convert
+   (cond
+    (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+    (convert? (minmax_sat_arg @0))
+    (convert? @0)))
+  (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+    (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+   (convert? (minmax_sat_arg @0))
+   (convert? @0))
+  (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+    (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+		  { mask; })))))
+
 /* Truncation via bit_and with mask.  Same concerns on convert? here.  */
 (simplify
  (convert
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index a984fa560..5c0c9cc49 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 /* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
 /* All of the vectorized patterns are expected to be matched.  */
 /* { dg-final { scan-assembler-not {cmtst\t} } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 2 } } */
--
2.33.0
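The new minmax_cmp_arg1 matcher covers the unsigned-compare spelling of the same range test. An equivalent scalar form (my C; the function name is mine):

#include <stdint.h>

/* (uint32_t) x > 255 tests the same condition as (x & ~255) != 0,
   so this variant folds to the identical min/max pair.  */
int32_t clip_unsigned_cmp (int32_t x)
{
  if ((uint32_t) x > 255)
    return ((-x) >> 31) & 255;
  return x;
}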
3968
0047-AES-Implement-AES-pattern-matching.patch
Normal file
File diff suppressed because it is too large
27
0048-crypto-accel-add-optimization-level-requirement-to-t.patch
Normal file
@@ -0,0 +1,27 @@
From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Dec 2023 23:31:26 +0800
Subject: [PATCH 16/18] [crypto-accel] add optimization level requirement to
 the gate

Fix issue (src-openEuler/gcc: I8RRDW).
---
 gcc/crypto-accel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc
index f4e810a6b..e7766a585 100644
--- a/gcc/crypto-accel.cc
+++ b/gcc/crypto-accel.cc
@@ -2391,7 +2391,7 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
   {
-    if (flag_crypto_accel_aes <= 0)
+    if (flag_crypto_accel_aes <= 0 || optimize < 1)
       return false;
     return targetm.get_v16qi_mode
	   && targetm.gen_rev32v16qi
--
2.33.0

239
0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
Normal file
@@ -0,0 +1,239 @@
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Tue, 12 Mar 2024 23:30:56 +0800
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
 vectorization

Take the minimum between the iteration count and the segment length;
this helps to speed up loops with a small number of iterations when
only the tail can be vectorized.

---
 gcc/params.opt                                |  5 ++
 .../sve/var_stride_flexible_segment_len_1.c   | 23 +++++++
 gcc/tree-data-ref.cc                          | 67 +++++++++++------
 gcc/tree-data-ref.h                           | 11 ++-
 gcc/tree-vect-data-refs.cc                    | 14 +++-
 5 files changed, 95 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c

diff --git a/gcc/params.opt b/gcc/params.opt
index 6176d4790..7e5c119cf 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
 Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.

+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use a minimum length of different segments.  Currently the minimum between
+iteration number and vectorization length is chosen by this param.
+
 -param=vect-max-version-for-alignment-checks=
 Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+  for (int i = 0; i < SIZE; ++i)
+    x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+   an overlap check that multiplies by (257-1)*4.  */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero.  */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c35..e6ae9e847 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
    same arguments.  Try to optimize cases in which the second access
    is a write and in which some overlap is valid.  */

-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
			   const dr_with_seg_len_pair_t &alias_pair)
 {
   const dr_with_seg_len& dr_a = alias_pair.first;
   const dr_with_seg_len& dr_b = alias_pair.second;

-  /* Check for cases in which:
-
-     (a) DR_B is always a write;
-     (b) the accesses are well-ordered in both the original and new code
-	 (see the comment above the DR_ALIAS_* flags for details); and
-     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
-  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
-    return false;
-
-  /* Check for equal (but possibly variable) steps.  */
   tree step = DR_STEP (dr_a.dr);
-  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
-    return false;
-
-  /* Make sure that we can operate on sizetype without loss of precision.  */
   tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
-  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
-    return false;

   /* All addresses involved are known to have a common alignment ALIGN.
      We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
			     fold_convert (ssizetype, indicator),
			     ssize_int (0));

-  /* Get lengths in sizetype.  */
-  tree seg_len_a
-    = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
   step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));

   /* Each access has the following pattern:
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
   *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
   if (dump_enabled_p ())
     dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2.  */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+			  const dr_with_seg_len_pair_t &alias_pair)
+{
+  const dr_with_seg_len& dr_a = alias_pair.first;
+  const dr_with_seg_len& dr_b = alias_pair.second;
+
+  /* Check for cases in which:
+
+     (a) DR_B is always a write;
+     (b) the accesses are well-ordered in both the original and new code
+	 (see the comment above the DR_ALIAS_* flags for details); and
+     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
+  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+    return false;
+
+  /* Check for equal (but possibly variable) steps.  */
+  tree step = DR_STEP (dr_a.dr);
+  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+    return false;
+
+  /* Make sure that we can operate on sizetype without loss of precision.  */
+  tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+    return false;
+
+  /* Get lengths in sizetype.  */
+  tree seg_len_a
+    = fold_convert (sizetype,
+		    rewrite_to_non_trapping_overflow (dr_a.seg_len));
+  create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+  if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+    {
+      tree seg_len2_a
+	= fold_convert (sizetype,
+			rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+      tree cond_expr2;
+      create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+      *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+				*cond_expr, cond_expr2);
+    }
   return true;
 }

diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index f643a95b2..9bc5f16ee 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -213,12 +213,19 @@ class dr_with_seg_len
 public:
   dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
		   unsigned int a)
-    : dr (d), seg_len (len), access_size (size), align (a) {}
-
+    : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+    {}
+  dr_with_seg_len (data_reference_p d, tree len, tree len2,
+		   unsigned HOST_WIDE_INT size, unsigned int a)
+    : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+    {}
   data_reference_p dr;
   /* The offset of the last access that needs to be checked minus
      the offset of the first.  */
   tree seg_len;
+  /* The second version of segment length.  Currently this is used to
+     soften checks for a small number of iterations.  */
+  tree seg_len2;
   /* A value that, when added to abs (SEG_LEN), gives the total number of
      bytes in the segment.  */
   poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4e615b80b..04e68f621 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
     {
       poly_uint64 lower_bound;
       tree segment_length_a, segment_length_b;
+      tree segment_length2_a, segment_length2_b;
       unsigned HOST_WIDE_INT access_size_a, access_size_b;
       unsigned int align_a, align_b;

@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
	{
	  segment_length_a = size_zero_node;
	  segment_length_b = size_zero_node;
+	  segment_length2_a = size_zero_node;
+	  segment_length2_b = size_zero_node;
	}
       else
	{
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
	    length_factor = scalar_loop_iters;
	  else
	    length_factor = size_int (vect_factor);
+	  /* In any case we should remember scalar_loop_iters;
+	     this helps to create a flexible aliasing check
+	     for a small number of iterations.  */
	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+	  segment_length2_a
+	    = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+	  segment_length2_b
+	    = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
	}
       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
	}

       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
-			    access_size_a, align_a);
+			    segment_length2_a, access_size_a, align_a);
       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
-			    access_size_b, align_b);
+			    segment_length2_b, access_size_b, align_b);
       /* Canonicalize the order to be the one that's needed for accurate
	  RAW, WAR and WAW flags, in cases where the data references are
	  well-ordered.  The order doesn't really matter otherwise,
--
2.33.0
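A rough picture (my sketch, simplified to raw pointer arithmetic; the real code builds tree cond_exprs) of the OR-ed runtime check this patch creates: the original VF-based segment test is kept, and a second test derived from the scalar iteration count is OR-ed in, so short-trip-count loops can still pass the alias-versioning check.

#include <stddef.h>

static int no_overlap (const char *a, const char *b, size_t seg_len)
{
  return a + seg_len <= b || b + seg_len <= a;
}

/* TRUTH_OR_EXPR of the two generated cond_exprs.  */
static int alias_check (const char *a, const char *b,
                        size_t seg_len_vf, size_t seg_len_iters)
{
  return no_overlap (a, b, seg_len_vf) || no_overlap (a, b, seg_len_iters);
}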
2071
0050-Port-IPA-prefetch-to-GCC-12.patch
Normal file
File diff suppressed because it is too large
2216
0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
Normal file
File diff suppressed because it is too large
94
0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
Normal file
@@ -0,0 +1,94 @@
From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Fri, 29 Mar 2024 17:15:41 +0800
Subject: [PATCH 2/2] Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7)

---
 gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
index 9537e4835..1ceb5137f 100644
--- a/gcc/ipa-prefetch.cc
+++ b/gcc/ipa-prefetch.cc
@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map;
 typedef std::map<memref_t *, tree> memref_tree_map;

 typedef std::set<gimple *> stmt_set;
+typedef std::set<tree> tree_set;
 typedef std::map<tree, tree> tree_map;

 tree_memref_map *tm_map;
@@ -1124,8 +1125,21 @@ analyse_loops ()
     }
 }

+/* Compare memrefs by IDs; helper for qsort.  */
+
+static int
+memref_id_cmp (const void *p1, const void *p2)
+{
+  const memref_t *mr1 = *(const memref_t **) p1;
+  const memref_t *mr2 = *(const memref_t **) p2;
+
+  if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id)
+    return 1;
+  return -1;
+}
+
 /* Reduce the set filtering out memrefs with the same memory references,
-   return the result vector of memrefs.  */
+   sort and return the result vector of memrefs.  */

 static void
 reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
	  vec.safe_push (mr1);
	}
     }
+  vec.qsort (memref_id_cmp);
   if (dump_file)
     {
       fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ());
@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn)
     }

   /* Create other new vars.  Insert new stmts.  */
+  vec<memref_t *> used_mr_vec = vNULL;
   for (memref_set::const_iterator it = used_mrs.begin ();
        it != used_mrs.end (); it++)
+    used_mr_vec.safe_push (*it);
+  used_mr_vec.qsort (memref_id_cmp);
+
+  for (unsigned int j = 0; j < used_mr_vec.length (); j++)
     {
-      memref_t *mr = *it;
+      memref_t *mr = used_mr_vec[j];
       if (mr == comp_mr)
	continue;
       gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0,
@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn)
	  local = integer_three_node;
	  break;
	}
+  tree_set prefetched_addrs;
   for (unsigned int j = 0; j < vmrs.length (); j++)
     {
       memref_t *mr = vmrs[j];
@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn)
       tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
       if (decl_map->count (addr))
	addr = (*decl_map)[addr];
+      if (prefetched_addrs.count (addr))
+	continue;
       last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
				     3, addr, write_p, local);
       pcalls.safe_push (last_stmt);
       gimple_seq_add_stmt (&stmts, last_stmt);
+      prefetched_addrs.insert (addr);
       if (dump_file)
	{
	  fprintf (dump_file, "Insert %d prefetch stmt:\n", j);
--
2.33.0
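Both hunks serve determinism: iterating a std::set of pointers visits them in address order, which can differ from run to run, so the fix snapshots the set into a vector and sorts by the stable memref ID (it also deduplicates prefetches per address). A reduced C version (mine) of the comparator idea:

#include <stdlib.h>

struct memref { unsigned id; };

/* Order by pass-assigned ID, never by pointer value, so the emitted
   prefetch statements come out in a reproducible order.  */
static int memref_id_cmp (const void *p1, const void *p2)
{
  const struct memref *a = *(const struct memref * const *) p1;
  const struct memref *b = *(const struct memref * const *) p2;
  return a->id > b->id ? 1 : -1;
}

static void sort_memrefs (struct memref **v, size_t n)
{
  qsort (v, n, sizeof *v, memref_id_cmp);
}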
1366
0053-struct-reorg-Add-Semi-Relayout.patch
Normal file
File diff suppressed because it is too large
28
0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch
Normal file
@@ -0,0 +1,28 @@
From 9dc3df938b9ed2c27498c8548087fee1ce930366 Mon Sep 17 00:00:00 2001
From: Zhengchen Hui <zhengchenhui1@huawei.com>
Date: Tue, 2 Apr 2024 11:08:30 +0800
Subject: [PATCH] [Struct Reorg] Bugfix for structure pointer compression

---
 gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index fa33f2d35..3922873f3 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7541,9 +7541,11 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void)
       if (!type->has_legal_alloc_num)
	{
	  if (current_layout_opt_level & POINTER_COMPRESSION_UNSAFE)
+	    {
	      if (dump_file)
		fprintf (dump_file, " has unknown alloc size, but"
			 " in unsafe mode, so");
+	    }
	  else
	    {
	      if (dump_file)
--
2.33.0

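The two added braces fix a classic dangling-else: without them the else binds to the nearest if (dump_file) rather than to the POINTER_COMPRESSION_UNSAFE test, so the rejection path could run whenever dumping was disabled. A minimal standalone C illustration (mine, not the GCC source):

#include <stdio.h>

void dangling_else_demo (int unsafe_mode, FILE *dump_file)
{
  /* Buggy shape: this else pairs with if (dump_file).  */
  if (unsafe_mode)
    if (dump_file)
      fprintf (dump_file, "unknown alloc size, continuing\n");
    else
      fprintf (stderr, "rejected\n");   /* runs in unsafe mode too! */

  /* Fixed shape: braces force the intended pairing.  */
  if (unsafe_mode)
    {
      if (dump_file)
        fprintf (dump_file, "unknown alloc size, continuing\n");
    }
  else
    fprintf (stderr, "rejected\n");
}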
46
gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 20
+%global gcc_release 21

 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -172,6 +172,26 @@ Patch31: 0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch
 Patch32: 0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch
 Patch33: 0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch
 Patch34: 0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch
+Patch35: 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
+Patch36: 0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
+Patch37: 0037-Perform-early-if-conversion-of-simple-arithmetic.patch
+Patch38: 0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
+Patch39: 0039-Match-double-sized-mul-pattern.patch
+Patch40: 0040-Port-icp-patch-to-GCC-12.patch
+Patch41: 0041-Port-fixes-in-icp-to-GCC-12.patch
+Patch42: 0042-Add-split-complex-instructions-pass.patch
+Patch43: 0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
+Patch44: 0044-Port-maxmin-patch-to-GCC-12.patch
+Patch45: 0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
+Patch46: 0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
+Patch47: 0047-AES-Implement-AES-pattern-matching.patch
+Patch48: 0048-crypto-accel-add-optimization-level-requirement-to-t.patch
+Patch49: 0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
+Patch50: 0050-Port-IPA-prefetch-to-GCC-12.patch
+Patch51: 0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
+Patch52: 0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
+Patch53: 0053-struct-reorg-Add-Semi-Relayout.patch
+Patch54: 0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch

 # Part 3000 ~ 4999
 %ifarch loongarch64
@@ -801,6 +821,26 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch32 -p1
 %patch33 -p1
 %patch34 -p1
+%patch35 -p1
+%patch36 -p1
+%patch37 -p1
+%patch38 -p1
+%patch39 -p1
+%patch40 -p1
+%patch41 -p1
+%patch42 -p1
+%patch43 -p1
+%patch44 -p1
+%patch45 -p1
+%patch46 -p1
+%patch47 -p1
+%patch48 -p1
+%patch49 -p1
+%patch50 -p1
+%patch51 -p1
+%patch52 -p1
+%patch53 -p1
+%patch54 -p1

 %ifarch loongarch64
 %patch3001 -p1
@@ -3186,6 +3226,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*

 %changelog
+* Thu Apr 11 2024 Zhengchen Hui <zhengchenhui1@huawei.com> - 12.3.1-21
+- Type: Sync
+- DESC: Sync patch from openeuler/gcc
+
 * Thu Apr 11 2024 Zhenyu Zhao <zhaozhenyu17@huawei.com> - 12.3.1-20
 - Type: Sync
 - DESC: Sync patch from openeuler/gcc