!408 [Sync] Sync patches from openeuler/gcc.

From: @lesleyzheng1103 
Reviewed-by: @huang-xiaoquan 
Signed-off-by: @huang-xiaoquan
openeuler-ci-bot 2024-04-11 12:54:56 +00:00 committed by Gitee
commit bbbef3225a
21 changed files with 17497 additions and 1 deletion


@@ -0,0 +1,194 @@
From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 11:27:43 +0300
Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation
---
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.cc | 15 +++++++++
gcc/config/aarch64/aarch64.opt | 4 +++
gcc/config/aarch64/iterators.md | 3 +-
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ee7f0b89c..82f73805f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6454,6 +6454,54 @@
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
)
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (minus:<V_INT_EQUIV>
+ (ashift:<V_INT_EQUIV>
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+ (match_operand:VDQHSD 4 "half_size_operand"))
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+ [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*arch64_cmlt_tmp<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "#"
+ "&& reload_completed"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+ (set (match_dup 0)
+ (and:<V_INT_EQUIV>
+ (match_dup 0)
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ ""
+ [(set_attr "type" "neon_compare_zero")]
+)
+
(define_insn_and_split "aarch64_cm<optab>di"
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
(neg:DI
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a3da4ca30..04072ca25 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14064,6 +14064,21 @@ cost_minus:
return true;
}
+ /* Detect the aarch64_cmlt_as_arith instruction. Currently only this
+ pattern matches the condition. The costs of the cmlt and sub
+ instructions are comparable, so we do not increase the cost here. */
+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+ && GET_CODE (op1) == AND)
+ {
+ rtx op0_subop0 = XEXP (op0, 0);
+ if (rtx_equal_p (op0_subop0, op1))
+ {
+ rtx lshrt_op = XEXP (op0_subop0, 0);
+ if (GET_CODE (lshrt_op) == LSHIFTRT)
+ return true;
+ }
+ }
+
/* Look for SUB (extended register). */
if (is_a <scalar_int_mode> (mode)
&& aarch64_rtx_arith_op_extract_p (op1))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a64b927e9..101664c7c 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
This option is for use with fstack-protector-strong and not for use in
user-land code.
+mcmlt-arith
+Target Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
TargetVariable
long aarch64_stack_protector_guard_offset = 0
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 26a840d7f..967e6b0b1 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1485,7 +1485,8 @@
(V2DI "2s")])
;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+ (V8HI "16b") (V4SI "8h")
(V2DI "4s")])
;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac..07c14aacb 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -49,6 +49,31 @@
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
+(define_predicate "half_size_minus_one_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
(define_predicate "subreg_lowpart_operator"
(ior (match_code "truncate")
(and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks the usage of cmlt insns for the arithmetic/logic
+ * calculations in foo (). It is inspired by the x264 codec sources. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+ for (unsigned i = 0; i < 4; i++)
+ {
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+ b[i] = (a[i]+s)^s;
+ }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0
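
An editor's illustration of the idiom the new pattern targets (a sketch, not
part of the patch; sign_mask_16x2 is a hypothetical name): for a 32-bit value
holding two 16-bit lanes, the expression from the comment above yields 0xFFFF
in each lane whose sign bit is set and 0 otherwise, which is what a single
cmlt-against-zero computes per lane once the loop in the testcase is
vectorized.

/* Compile with -O3 -mcmlt-arith on aarch64 to observe the cmlt form.  */
static inline unsigned int
sign_mask_16x2 (unsigned int x)
{
  unsigned int m = (x >> 15) & 0x00010001; /* sign bit of each 16-bit lane */
  return (m << 16) - m;                    /* replicate it across the lane */
}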


@@ -0,0 +1,560 @@
From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 28 Feb 2024 23:05:12 +0800
Subject: [PATCH 02/18] [rtl-ifcvt] introduce rtl ifcvt enhancements

new option:
-fifcvt-allow-complicated-cmps: allows ifcvt to deal with complicated
cmps like
  cmp reg1 (reg2 + reg3)
can increase compilation time
new param:
-param=ifcvt-allow-register-renaming=[0,1,2]
  1: allows ifcvt to rename registers in then and else bb
  2: allows ifcvt to rename registers in condition and else/then bb
can increase compilation time and register pressure
---
gcc/common.opt | 4 +
gcc/ifcvt.cc | 291 +++++++++++++++---
gcc/params.opt | 4 +
.../gcc.c-torture/execute/ifcvt-renaming-1.c | 35 +++
gcc/testsuite/gcc.dg/ifcvt-6.c | 27 ++
5 files changed, 311 insertions(+), 50 deletions(-)
create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c
diff --git a/gcc/common.opt b/gcc/common.opt
index c7c6bc256..aa00fb7b0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3691,4 +3691,8 @@ fipa-ra
Common Var(flag_ipa_ra) Optimization
Use caller save register across calls if possible.
+fifcvt-allow-complicated-cmps
+Common Var(flag_ifcvt_allow_complicated_cmps) Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
; This comment is to ensure we retain the blank line above.
diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 2c1eba312..584db7b55 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
}
/* Don't even try if the comparison operands or the mode of X are weird. */
- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+ if (!flag_ifcvt_allow_complicated_cmps
+ && (cond_complex
+ || !SCALAR_INT_MODE_P (GET_MODE (x))))
return NULL_RTX;
return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
/* Currently support only simple single sets in test_bb. */
if (!sset
|| !noce_operand_ok (SET_DEST (sset))
- || contains_ccmode_rtx_p (SET_DEST (sset))
+ || (!flag_ifcvt_allow_complicated_cmps
+ && contains_ccmode_rtx_p (SET_DEST (sset)))
|| !noce_operand_ok (SET_SRC (sset)))
return false;
@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
in this function. */
static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+ basic_block bb_b,
+ rtx to_rename,
+ bitmap conflict_regs)
{
rtx_insn *a_insn;
bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+ bitmap intersections = BITMAP_ALLOC (&reg_obstack);
df_ref def;
df_ref use;
+ rtx_insn *last_a = last_active_insn (bb_a, FALSE);
FOR_BB_INSNS (bb_a, a_insn)
{
@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_a = single_set (a_insn);
if (!sset_a)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Record all registers that BB_A sets. */
FOR_EACH_INSN_DEF (def, a_insn)
- if (!(to_rename && DF_REF_REG (def) == to_rename))
+ if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a))
bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
}
+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
rtx_insn *b_insn;
-
FOR_BB_INSNS (bb_b, b_insn)
{
if (!active_insn_p (b_insn))
@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
rtx sset_b = single_set (b_insn);
if (!sset_b)
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
/* Make sure this is a REG and not some instance
of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
if (MEM_P (SET_DEST (sset_b)))
gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
else if (!REG_P (SET_DEST (sset_b)))
- {
- BITMAP_FREE (bba_sets);
- return false;
- }
+ goto end_cmove_arith_check_and_fail;
- /* If the insn uses a reg set in BB_A return false. */
+ /* If the insn uses a reg set in BB_A return false
+ or try to collect register list for renaming. */
FOR_EACH_INSN_USE (use, b_insn)
{
- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+ if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
{
- BITMAP_FREE (bba_sets);
- return false;
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto end_cmove_arith_check_and_fail;
+
+ /* Those regs should be renamed. We can't rename CC reg, but
+ possibly we can provide combined comparison in the future. */
+ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+ goto end_cmove_arith_check_and_fail;
+ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
}
}
-
}
BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
return true;
+
+end_cmove_arith_check_and_fail:
+ BITMAP_FREE (bba_sets);
+ BITMAP_FREE (intersections);
+ return false;
}
/* Emit copies of all the active instructions in BB except the last.
@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
return true;
}
+/* This function tries to rename the regs in the condition expression
+ that intersect with the considered bb. The condition expression will
+ be moved down if the optimization is applied, so it is essential to be
+ sure that all intersecting registers can be renamed; otherwise the
+ transformation can't be applied. Returns true if renaming was
+ successful and the optimization can proceed further. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+ bool success = true;
+ if (bitmap_empty_p (cond_rename_regs))
+ return true;
+ if (param_ifcvt_allow_register_renaming < 2)
+ return false;
+ df_ref use;
+ rtx_insn *cmp_insn = if_info->cond_earliest;
+ /* A jump instruction as the condition is currently unsupported. */
+ if (JUMP_P (cmp_insn))
+ return false;
+ rtx_insn *before_cmp = PREV_INSN (cmp_insn);
+ start_sequence ();
+ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+ FOR_EACH_INSN_USE (use, cmp_insn)
+ {
+ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+ {
+ rtx use_reg = DF_REF_REG (use);
+ rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+ {
+ end_sequence ();
+ return false;
+ }
+ noce_emit_move_insn (tmp, use_reg);
+ }
+ }
+
+ emit_insn (PATTERN (copy_of_cmp));
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+
+ emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+ delete_insn_and_edges (cmp_insn);
+ rtx_insn *insn;
+ FOR_BB_INSNS (cmp_block, insn)
+ df_insn_rescan (insn);
+
+ if_info->cond = noce_get_condition (if_info->jump,
+ &copy_of_cmp,
+ if_info->then_else_reversed);
+ if_info->cond_earliest = copy_of_cmp;
+ if_info->rev_cond = NULL_RTX;
+
+ return success;
+}
+
+/* This function tries to rename regs that intersect with the considered
+ bb. Returns true if the renaming was successful and the optimization
+ can proceed further, false otherwise. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+ if (bitmap_empty_p (rename_regs))
+ return true;
+ rtx_insn *insn;
+ rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+ bool res = true;
+ start_sequence ();
+ FOR_BB_INSNS (test_bb, insn)
+ {
+ if (!active_insn_p (insn))
+ continue;
+ /* Only ssets are supported for now. */
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
+ rtx x = SET_DEST (sset);
+ if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x)))
+ continue;
+ /* No need to rename the dest in the last instruction:
+ it will be renamed anyway. */
+ if (insn == last_insn)
+ continue;
+ machine_mode mode = GET_MODE (x);
+ rtx tmp = gen_reg_rtx (mode);
+ if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+ {
+ gcc_assert (insn != last_insn);
+ /* We could generate an additional move for such a case,
+ but it would increase register pressure.
+ For now just stop the transformation. */
+ rtx result_rtx = SET_DEST (single_set (last_insn));
+ if (REG_P (result_rtx) && (x != result_rtx))
+ {
+ res = false;
+ break;
+ }
+ if (!validate_replace_rtx (x, tmp, insn))
+ gcc_unreachable ();
+ noce_emit_move_insn (tmp,x);
+ }
+ set_used_flags (insn);
+ rtx_insn *rename_candidate;
+ for (rename_candidate = NEXT_INSN (insn);
+ rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
+ rename_candidate = NEXT_INSN (rename_candidate))
+ {
+ if (!reg_overlap_mentioned_p (x, rename_candidate))
+ continue;
+
+ int replace_res = TRUE;
+ if (rename_candidate == last_insn)
+ {
+ validate_replace_src_group (x, tmp, rename_candidate);
+ replace_res = apply_change_group ();
+ }
+ else
+ replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+ gcc_assert (replace_res);
+ set_used_flags (rename_candidate);
+ }
+ set_used_flags (x);
+ set_used_flags (tmp);
+ }
+ rtx_insn *seq = get_insns ();
+ unshare_all_rtl_in_chain (seq);
+ end_sequence ();
+ emit_insn_before_setloc (seq, first_active_insn (test_bb),
+ INSN_LOCATION (first_active_insn (test_bb)));
+ FOR_BB_INSNS (test_bb, insn)
+ df_insn_rescan (insn);
+ return res;
+}
+
/* Try more complex cases involving conditional_move. */
static int
@@ -2185,11 +2331,30 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
std::swap (then_bb, else_bb);
}
}
-
+ bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+ bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (then_bb && else_bb
- && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
- || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
- return FALSE;
+ && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
+ if_info->orig_x,
+ then_bb_rename_regs)
+ || !bbs_ok_for_cmove_arith (else_bb, then_bb,
+ if_info->orig_x,
+ else_bb_rename_regs)))
+ {
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+ return FALSE;
+ }
+ bool prepass_renaming = noce_rename_regs_in_bb (then_bb,
+ then_bb_rename_regs)
+ && noce_rename_regs_in_bb (else_bb,
+ else_bb_rename_regs);
+
+ BITMAP_FREE (then_bb_rename_regs);
+ BITMAP_FREE (else_bb_rename_regs);
+
+ if (!prepass_renaming)
+ return FALSE;
start_sequence ();
@@ -3072,7 +3237,8 @@ noce_operand_ok (const_rtx op)
static bool
bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
- unsigned int *cost, bool *simple_p)
+ unsigned int *cost, bool *simple_p,
+ bitmap cond_rename_regs)
{
if (!test_bb)
return false;
@@ -3112,8 +3278,9 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *prev_last_insn = PREV_INSN (last_insn);
gcc_assert (prev_last_insn);
- /* For now, disallow setting x multiple times in test_bb. */
- if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+ if (REG_P (x)
+ && reg_set_between_p (x, first_insn, prev_last_insn)
+ && param_ifcvt_allow_register_renaming < 1)
return false;
bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
@@ -3125,25 +3292,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
rtx_insn *insn;
FOR_BB_INSNS (test_bb, insn)
{
- if (insn != last_insn)
- {
- if (!active_insn_p (insn))
- continue;
+ if (insn == last_insn)
+ continue;
+ if (!active_insn_p (insn))
+ continue;
- if (!insn_valid_noce_process_p (insn, cc))
- goto free_bitmap_and_fail;
+ if (!insn_valid_noce_process_p (insn, cc))
+ goto free_bitmap_and_fail;
- rtx sset = single_set (insn);
- gcc_assert (sset);
+ rtx sset = single_set (insn);
+ gcc_assert (sset);
- if (contains_mem_rtx_p (SET_SRC (sset))
- || !REG_P (SET_DEST (sset))
- || reg_overlap_mentioned_p (SET_DEST (sset), cond))
- goto free_bitmap_and_fail;
+ if (contains_mem_rtx_p (SET_SRC (sset))
+ || !REG_P (SET_DEST (sset)))
+ goto free_bitmap_and_fail;
- potential_cost += pattern_cost (sset, speed_p);
- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+ if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+ {
+ if (param_ifcvt_allow_register_renaming < 1)
+ goto free_bitmap_and_fail;
+ rtx sset_dest = SET_DEST (sset);
+ if (REG_P (sset_dest)
+ && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+ bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+ else
+ goto free_bitmap_and_fail;
}
+ potential_cost += pattern_cost (sset, speed_p);
+ if (SET_DEST (sset) != SET_DEST (last_set))
+ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
}
/* If any of the intermediate results in test_bb are live after test_bb
@@ -3777,15 +3954,29 @@ noce_process_if_block (struct noce_if_info *if_info)
bool speed_p = optimize_bb_for_speed_p (test_bb);
unsigned int then_cost = 0, else_cost = 0;
+ bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
- &if_info->then_simple))
- return false;
+ &if_info->then_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
if (else_bb
&& !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
- &if_info->else_simple))
- return false;
+ &if_info->else_simple, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+ if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
+ {
+ BITMAP_FREE (cond_rename_regs);
+ return false;
+ }
+ BITMAP_FREE (cond_rename_regs);
+ cond = if_info->cond;
if (speed_p)
if_info->original_cost += average_cost (then_cost, else_cost,
find_edge (test_bb, then_bb));
@@ -5823,12 +6014,13 @@ if_convert (bool after_combine)
{
basic_block bb;
int pass;
-
if (optimize == 1)
{
df_live_add_problem ();
df_live_set_all_dirty ();
}
+ free_dominance_info (CDI_DOMINATORS);
+ cleanup_cfg (CLEANUP_EXPENSIVE);
/* Record whether we are after combine pass. */
ifcvt_after_combine = after_combine;
@@ -5933,7 +6125,6 @@ rest_of_handle_if_conversion (void)
dump_reg_info (dump_file);
dump_flow_info (dump_file, dump_flags);
}
- cleanup_cfg (CLEANUP_EXPENSIVE);
if_convert (false);
if (num_updated_if_blocks)
/* Get rid of any dead CC-related instructions. */
diff --git a/gcc/params.opt b/gcc/params.opt
index d2196dc68..ba87f820b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -669,6 +669,10 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
-param=max-sched-extend-regions-iters=
Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
The maximum number of iterations through CFG to extend regions.
diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
new file mode 100644
index 000000000..65c4d4140
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
@@ -0,0 +1,35 @@
+
+extern void abort(void);
+
+__attribute__ ((noinline))
+int foo (int x, int y, int z, int a, int b)
+{
+ if (a < 2) {
+ if (a == 0) {
+ if (x - y < 0)
+ x = x - y + z;
+ else
+ x = x - y;
+ }
+ else {
+ if (x + y >= z)
+ x = x + y - z;
+ else
+ x = x + y;
+ }
+ }
+ return x;
+}
+
+int main(void) {
+ if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2
+ abort ();
+ if (foo (50,10,7,0,1) != 40) // x - y = 40
+ abort ();
+ if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8
+ abort ();
+ if (foo (5,10,70,1,1) != 15) // x + y = 15
+ abort ();
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c
new file mode 100644
index 000000000..be9a67b3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-6.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 -fifcvt-allow-complicated-cmps" } */
+
+typedef unsigned int uint16_t;
+
+uint16_t
+foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a,
+ uint16_t b, uint16_t c, uint16_t d) {
+ int i = 1;
+ int j = 1;
+ if (a > b) {
+ j = x;
+ if (b > c)
+ i = y;
+ else
+ i = z;
+ }
+ else {
+ j = y;
+ if (c > d)
+ i = z;
+ }
+ return i * j;
+}
+
+/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */
+
--
2.33.0
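
To illustrate the shape of branch these flags are aimed at (an editor's
sketch, not from the patch; pick is a hypothetical name): with
-fifcvt-allow-complicated-cmps the pass may consider a comparison whose
operand is itself an expression, and --param=ifcvt-allow-register-renaming
lets it rename the temporaries that would otherwise block the conversion.

/* Whether this actually becomes a conditional select depends on the
   target's cost model.  */
int
pick (int a, int b, int c, int x, int y)
{
  if (a < b + c)  /* cmp reg1, (reg2 + reg3): a "complicated" cmp */
    return x;
  return y;
}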


@@ -0,0 +1,109 @@
From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 12 Jan 2023 14:52:49 +0300
Subject: [PATCH 03/18] Perform early if-conversion of simple arithmetic
---
gcc/common.opt | 4 ++++
gcc/match.pd | 25 +++++++++++++++++++
gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
3 files changed, 66 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c
diff --git a/gcc/common.opt b/gcc/common.opt
index aa00fb7b0..dac477c04 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1821,6 +1821,10 @@ fif-conversion2
Common Var(flag_if_conversion2) Optimization
Perform conversion of conditional jumps to conditional execution.
+fif-conversion-gimple
+Common Var(flag_if_conversion_gimple) Optimization
+Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
fstack-reuse=
Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
-fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 6f24d5079..3cbaf2a5b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
)
)
)
+
+(if (flag_if_conversion_gimple)
+ (for simple_op (plus minus bit_and bit_ior bit_xor)
+ (simplify
+ (cond @0 (simple_op @1 INTEGER_CST@2) @1)
+ (switch
+ /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
+ (if (integer_onep (@2))
+ (simple_op @1 (convert (convert:boolean_type_node @0))))
+ /* a = cond ? a + powerof2cst : a ->
+ a = a + ((int) cond) << log2 (powerof2cst) */
+ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
+ (with
+ {
+ tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
+ }
+ (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
+ { shift; })
+ )
+ )
+ )
+ )
+ )
+ )
+)
#endif
#if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
new file mode 100644
index 000000000..0f7c87e5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
+
+int test_int (int optimizable_int) {
+ if (optimizable_int > 5)
+ ++optimizable_int;
+ return optimizable_int;
+}
+
+int test_int_pow2 (int optimizable_int_pow2) {
+ if (optimizable_int_pow2 <= 4)
+ optimizable_int_pow2 += 1024;
+ return optimizable_int_pow2;
+}
+
+int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
+ if (not_optimizable_int_non_pow2 == 1)
+ not_optimizable_int_non_pow2 += 513;
+ return not_optimizable_int_non_pow2;
+}
+
+float test_float (float not_optimizable_float) {
+ if (not_optimizable_float > 5)
+ not_optimizable_float += 1;
+ return not_optimizable_float;
+}
+
+/* Expecting if-else block in test_float and test_int_non_pow2 only. */
+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
+
+/* Expecting shifted result only for optimizable_int_pow2. */
+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
--
2.33.0
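
In source terms the new match.pd rules perform the following rewrite (an
editor's sketch; f and f_branchless are illustrative names, not from the
patch):

int f (int a)
{
  if (a > 5)
    a += 8;                   /* 8 == 1 << 3, a power of two */
  return a;
}

/* is turned into the branchless form */
int f_branchless (int a)
{
  return a + ((a > 5) << 3);  /* a + ((int) cond << log2 (8)) */
}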


@@ -0,0 +1,252 @@
From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 24 Jan 2023 16:43:40 +0300
Subject: [PATCH 04/18] Add option to allow matching uaddsub overflow for widen
ops too.
---
gcc/common.opt | 5 ++
gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
gcc/tree-ssa-math-opts.cc | 43 ++++++++--
3 files changed, 184 insertions(+), 7 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c
diff --git a/gcc/common.opt b/gcc/common.opt
index dac477c04..39c90604e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3106,6 +3106,11 @@ freciprocal-math
Common Var(flag_reciprocal_math) SetByCombined Optimization
Same as -fassociative-math for expressions which include division.
+fuaddsub-overflow-match-all
+Common Var(flag_uaddsub_overflow_match_all)
+Match unsigned add/sub overflow even if the target does not support
+the corresponding instruction.
+
; Nonzero means that unsafe floating-point math optimizations are allowed
; for the sake of speed. IEEE compliance is not guaranteed, and operations
; are allowed to assume that their arguments and results are "normal"
diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
new file mode 100644
index 000000000..96c26d308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/uaddsub.c
@@ -0,0 +1,143 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+ uint128_t lo;
+ uint128_t hi;
+} uint256_t;
+
+uint16_t add16 (uint8_t a, uint8_t b)
+{
+ uint8_t tmp = a + b;
+ uint8_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint16_t res = overflow;
+ res <<= 8;
+ res += tmp;
+ return res;
+}
+
+uint32_t add32 (uint16_t a, uint16_t b)
+{
+ uint16_t tmp = a + b;
+ uint16_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint32_t res = overflow;
+ res <<= 16;
+ res += tmp;
+ return res;
+}
+
+uint64_t add64 (uint32_t a, uint32_t b)
+{
+ uint32_t tmp = a + b;
+ uint32_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint64_t res = overflow;
+ res <<= 32;
+ res += tmp;
+ return res;
+}
+
+uint128_t add128 (uint64_t a, uint64_t b)
+{
+ uint64_t tmp = a + b;
+ uint64_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint128_t res = overflow;
+ res <<= 64;
+ res += tmp;
+ return res;
+}
+
+uint256_t add256 (uint128_t a, uint128_t b)
+{
+ uint128_t tmp = a + b;
+ uint128_t overflow = 0;
+ if (tmp < a)
+ overflow = 1;
+
+ uint256_t res;
+ res.hi = overflow;
+ res.lo = tmp;
+ return res;
+}
+
+uint16_t sub16 (uint8_t a, uint8_t b)
+{
+ uint8_t tmp = a - b;
+ uint8_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint16_t res = overflow;
+ res <<= 8;
+ res += tmp;
+ return res;
+}
+
+uint32_t sub32 (uint16_t a, uint16_t b)
+{
+ uint16_t tmp = a - b;
+ uint16_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint32_t res = overflow;
+ res <<= 16;
+ res += tmp;
+ return res;
+}
+
+uint64_t sub64 (uint32_t a, uint32_t b)
+{
+ uint32_t tmp = a - b;
+ uint32_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint64_t res = overflow;
+ res <<= 32;
+ res += tmp;
+ return res;
+}
+
+uint128_t sub128 (uint64_t a, uint64_t b)
+{
+ uint64_t tmp = a - b;
+ uint64_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint128_t res = overflow;
+ res <<= 64;
+ res += tmp;
+ return res;
+}
+
+uint256_t sub256 (uint128_t a, uint128_t b)
+{
+ uint128_t tmp = a - b;
+ uint128_t overflow = 0;
+ if (tmp > a)
+ overflow = -1;
+
+ uint256_t res;
+ res.hi = overflow;
+ res.lo = tmp;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 232e903b0..55d6ee8ae 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
}
}
+/* Check if the corresponding operation has a wider equivalent on the target. */
+
+static bool
+wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
+{
+ machine_mode wider_mode;
+ FOR_EACH_WIDER_MODE (wider_mode, mode)
+ {
+ machine_mode next_mode;
+ if (optab_handler (op, wider_mode) != CODE_FOR_nothing
+ || (op == smul_optab
+ && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
+ && (find_widening_optab_handler ((unsignedp
+ ? umul_widen_optab
+ : smul_widen_optab),
+ next_mode, mode))))
+ return true;
+ }
+
+ return false;
+}
/* Helper function of match_arith_overflow. For MUL_OVERFLOW, if we have
a check for non-zero like:
@@ -3903,15 +3924,22 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
|| code == MINUS_EXPR
|| code == MULT_EXPR
|| code == BIT_NOT_EXPR);
+ int unsignedp = TYPE_UNSIGNED (type);
if (!INTEGRAL_TYPE_P (type)
- || !TYPE_UNSIGNED (type)
- || has_zero_uses (lhs)
- || (code != PLUS_EXPR
- && code != MULT_EXPR
- && optab_handler (code == MINUS_EXPR ? usubv4_optab : uaddv4_optab,
- TYPE_MODE (type)) == CODE_FOR_nothing))
+ || !unsignedp
+ || has_zero_uses (lhs))
return false;
+ if (code == PLUS_EXPR || code == MINUS_EXPR)
+ {
+ machine_mode mode = TYPE_MODE (type);
+ optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
+ if (optab_handler (op, mode) == CODE_FOR_nothing
+ && (!flag_uaddsub_overflow_match_all
+ || !wider_optab_check_p (op, mode, unsignedp)))
+ return false;
+ }
+
tree rhs1 = gimple_assign_rhs1 (stmt);
tree rhs2 = gimple_assign_rhs2 (stmt);
FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
@@ -3986,7 +4014,8 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
|| (code != MULT_EXPR && (code == BIT_NOT_EXPR ? use_seen : !use_seen))
|| (code == PLUS_EXPR
&& optab_handler (uaddv4_optab,
- TYPE_MODE (type)) == CODE_FOR_nothing)
+ TYPE_MODE (type)) == CODE_FOR_nothing
+ && !flag_uaddsub_overflow_match_all)
|| (code == MULT_EXPR
&& optab_handler (cast_stmt ? mulv4_optab : umulv4_optab,
TYPE_MODE (type)) == CODE_FOR_nothing))
--
2.33.0
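
The matched idiom is the usual carry-out computation; a sketch mirroring the
testcases (add32_carry is an illustrative name, not from the patch):

/* With -fuaddsub-overflow-match-all the addition and the comparison are
   intended to fold into one .ADD_OVERFLOW call even when the target has
   no uaddv4 pattern in this mode, as long as a wider mode supports it.  */
unsigned long long
add32_carry (unsigned int a, unsigned int b)
{
  unsigned int sum = a + b;
  unsigned int carry = sum < a;  /* carry-out of the unsigned addition */
  return ((unsigned long long) carry << 32) | sum;
}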


@@ -0,0 +1,488 @@
From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Wed, 25 Jan 2023 15:04:07 +0300
Subject: [PATCH 05/18] Match double sized mul pattern
---
gcc/match.pd | 136 +++++++++++++++++++++
gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++
gcc/tree-ssa-math-opts.cc | 80 ++++++++++++
4 files changed, 419 insertions(+)
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c
diff --git a/gcc/match.pd b/gcc/match.pd
index 3cbaf2a5b..61866cb90 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7895,3 +7895,139 @@ and,
== TYPE_UNSIGNED (TREE_TYPE (@3))))
&& single_use (@4)
&& single_use (@5))))
+
+/* Match multiplication with double sized result.
+
+ Consider the following calculations:
+ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
+ * (2^(bit_size/2) * arg1_hi + arg1_lo)
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
+ + arg0_lo * arg1_lo
+
+ The products of the high and low parts fit in bit_size values, so they
+ are placed in the high and low parts of the result respectively.
+
+ The sum of the mixed products may overflow, so we need to detect that.
+ It also has a bit_size/2 offset, so it intersects both the high and low
+ parts of the result; this is why the overflow detection constant is
+ bit_size/2.
+
+ With this info:
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
+ + 2^(bit_size/2) * middle
+ + 2^bit_size * possible_middle_overflow
+ + arg0_lo * arg1_lo
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
+ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
+ + arg0_lo * arg1_lo
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
+ + possible_middle_overflow)
+ + 2^(bit_size/2) * middle_lo
+ + arg0_lo * arg1_lo
+
+ The last sum can produce overflow for the high result part. With this:
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
+ + possible_res_lo_overflow + middle_hi)
+ + res_lo
+ = res_hi + res_lo
+
+ This formula is too big to fit into one match pattern with all of the
+ combinations of terms inside it, so there are many helpers here for
+ better code readability.
+
+ The simplification is keyed on res_hi: computing res_lo alone is
+ assumed not to be a practical case for such calculations.
+
+ Overflow handling is done via matching complex calculations:
+ the realpart and imagpart are quite handy here. */
+/* Match low and high parts of the argument. */
+(match (double_size_mul_arg_lo @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1)
+ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
+(match (double_size_mul_arg_hi @0 @1)
+ (rshift @0 INTEGER_CST@1)
+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
+
+/* Match various argument parts products. */
+(match (double_size_mul_lolo @0 @1)
+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_hihi @0 @1)
+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+(match (double_size_mul_lohi @0 @1)
+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
+ (if (single_use (@4))))
+
+/* Match complex middle sum. */
+(match (double_size_mul_middle_complex @0 @1)
+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
+ (if (num_imm_uses (@2) == 2)))
+
+/* Match real middle results. */
+(match (double_size_mul_middle @0 @1)
+ (realpart@2 (double_size_mul_middle_complex @0 @1))
+ (if (num_imm_uses (@2) == 2)))
+(match (double_size_mul_middleres_lo @0 @1)
+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3))))
+(match (double_size_mul_middleres_hi @0 @1)
+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3))))
+
+/* Match low result part. */
+/* The number of uses may be < 2 in the case when we are interested
+ in the high part only. */
+(match (double_size_mul_res_lo_complex @0 @1)
+ (IFN_ADD_OVERFLOW:c@2
+ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
+ (if (num_imm_uses (@2) <= 2)))
+(match (double_size_mul_res_lo @0 @1)
+ (realpart (double_size_mul_res_lo_complex @0 @1)))
+
+/* Match overflow terms. */
+(match (double_size_mul_overflow_check_lo @0 @1 @5)
+ (convert@4 (ne@3
+ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
+ (if (single_use (@2) && single_use (@3) && single_use (@4))))
+(match (double_size_mul_overflow_check_hi @0 @1)
+ (lshift@6 (convert@5 (ne@4
+ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
+ INTEGER_CST@2)
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
+ && single_use (@3) && single_use (@4) && single_use (@5)
+ && single_use (@6))))
+
+/* Match all possible permutations for high result part calculations. */
+(for op1 (double_size_mul_hihi
+ double_size_mul_overflow_check_hi
+ double_size_mul_middleres_hi)
+ op2 (double_size_mul_overflow_check_hi
+ double_size_mul_middleres_hi
+ double_size_mul_hihi)
+ op3 (double_size_mul_middleres_hi
+ double_size_mul_hihi
+ double_size_mul_overflow_check_hi)
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
+ (plus:c@4 (op1:c @0 @1)
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (op1:c @0 @1)
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5))))
+ (match (double_size_mul_candidate @0 @1 @2 @3)
+ (plus:c@2 (op1:c @0 @1)
+ (plus:c@4 (op2:c @0 @1)
+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5)))))
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
new file mode 100644
index 000000000..4d475cc8a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
@@ -0,0 +1,141 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
+ proper overflow detection in some cases. */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+
+uint16_t mul16 (uint8_t a, uint8_t b)
+{
+ uint8_t a_lo = a & 0xF;
+ uint8_t b_lo = b & 0xF;
+ uint8_t a_hi = a >> 4;
+ uint8_t b_hi = b >> 4;
+ uint8_t lolo = a_lo * b_lo;
+ uint8_t lohi = a_lo * b_hi;
+ uint8_t hilo = a_hi * b_lo;
+ uint8_t hihi = a_hi * b_hi;
+ uint8_t middle = hilo + lohi;
+ uint8_t middle_hi = middle >> 4;
+ uint8_t middle_lo = middle << 4;
+ uint8_t res_lo = lolo + middle_lo;
+ uint8_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10 : 0);
+ uint16_t res = ((uint16_t) res_hi) << 8;
+ res += res_lo;
+ return res;
+}
+
+uint32_t mul32 (uint16_t a, uint16_t b)
+{
+ uint16_t a_lo = a & 0xFF;
+ uint16_t b_lo = b & 0xFF;
+ uint16_t a_hi = a >> 8;
+ uint16_t b_hi = b >> 8;
+ uint16_t lolo = a_lo * b_lo;
+ uint16_t lohi = a_lo * b_hi;
+ uint16_t hilo = a_hi * b_lo;
+ uint16_t hihi = a_hi * b_hi;
+ uint16_t middle = hilo + lohi;
+ uint16_t middle_hi = middle >> 8;
+ uint16_t middle_lo = middle << 8;
+ uint16_t res_lo = lolo + middle_lo;
+ uint16_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x100 : 0);
+ uint32_t res = ((uint32_t) res_hi) << 16;
+ res += res_lo;
+ return res;
+}
+
+uint64_t mul64 (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10000 : 0);
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res;
+}
+
+uint128_t mul128 (uint64_t a, uint64_t b)
+{
+ uint64_t a_lo = a & 0xFFFFFFFF;
+ uint64_t b_lo = b & 0xFFFFFFFF;
+ uint64_t a_hi = a >> 32;
+ uint64_t b_hi = b >> 32;
+ uint64_t lolo = a_lo * b_lo;
+ uint64_t lohi = a_lo * b_hi;
+ uint64_t hilo = a_hi * b_lo;
+ uint64_t hihi = a_hi * b_hi;
+ uint64_t middle = hilo + lohi;
+ uint64_t middle_hi = middle >> 32;
+ uint64_t middle_lo = middle << 32;
+ uint64_t res_lo = lolo + middle_lo;
+ uint64_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x100000000 : 0);
+ uint128_t res = ((uint128_t) res_hi) << 64;
+ res += res_lo;
+ return res;
+}
+
+uint64_t mul64_perm (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+ res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res;
+}
+
+uint128_t mul128_perm (uint64_t a, uint64_t b)
+{
+ uint64_t a_lo = a & 0xFFFFFFFF;
+ uint64_t b_lo = b & 0xFFFFFFFF;
+ uint64_t a_hi = a >> 32;
+ uint64_t b_hi = b >> 32;
+ uint64_t lolo = a_lo * b_lo;
+ uint64_t lohi = a_lo * b_hi;
+ uint64_t hilo = a_hi * b_lo;
+ uint64_t hihi = a_hi * b_hi;
+ uint64_t middle = hilo + lohi;
+ uint64_t middle_hi = middle >> 32;
+ uint64_t middle_lo = middle << 32;
+ uint64_t res_lo = lolo + middle_lo;
+ uint64_t res_hi = hihi + middle_hi;
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
+ res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
+ uint128_t res = ((uint128_t) res_hi) << 64;
+ res += res_lo;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
new file mode 100644
index 000000000..cc6e5af25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* fif-conversion-gimple is required for proper overflow detection
+ in some cases. */
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+ uint128_t lo;
+ uint128_t hi;
+} uint256_t;
+
+uint64_t mul64_double_use (uint32_t a, uint32_t b)
+{
+ uint32_t a_lo = a & 0xFFFF;
+ uint32_t b_lo = b & 0xFFFF;
+ uint32_t a_hi = a >> 16;
+ uint32_t b_hi = b >> 16;
+ uint32_t lolo = a_lo * b_lo;
+ uint32_t lohi = a_lo * b_hi;
+ uint32_t hilo = a_hi * b_lo;
+ uint32_t hihi = a_hi * b_hi;
+ uint32_t middle = hilo + lohi;
+ uint32_t middle_hi = middle >> 16;
+ uint32_t middle_lo = middle << 16;
+ uint32_t res_lo = lolo + middle_lo;
+ uint32_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ res_hi += (middle < hilo ? 0x10000 : 0);
+ uint64_t res = ((uint64_t) res_hi) << 32;
+ res += res_lo;
+ return res + lolo;
+}
+
+uint256_t mul256 (uint128_t a, uint128_t b)
+{
+ uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
+ uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
+ uint128_t a_hi = a >> 64;
+ uint128_t b_hi = b >> 64;
+ uint128_t lolo = a_lo * b_lo;
+ uint128_t lohi = a_lo * b_hi;
+ uint128_t hilo = a_hi * b_lo;
+ uint128_t hihi = a_hi * b_hi;
+ uint128_t middle = hilo + lohi;
+ uint128_t middle_hi = middle >> 64;
+ uint128_t middle_lo = middle << 64;
+ uint128_t res_lo = lolo + middle_lo;
+ uint128_t res_hi = hihi + middle_hi;
+ res_hi += (res_lo < middle_lo ? 1 : 0);
+ /* Workaround for the "constant is too big" warning. */
+ uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
+ overflow_tmp <<= 64;
+ res_hi += overflow_tmp;
+ uint256_t res;
+ res.lo = res_lo;
+ res.hi = res_hi;
+ return res;
+}
+
+/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 55d6ee8ae..2c06b8a60 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -210,6 +210,9 @@ static struct
/* Number of highpart multiplication ops inserted. */
int highpart_mults_inserted;
+
+ /* Number of optimized double sized multiplications. */
+ int double_sized_mul_optimized;
} widen_mul_stats;
/* The instance of "struct occurrence" representing the highest
@@ -4893,6 +4896,78 @@ optimize_spaceship (gimple *stmt)
}
+/* Pattern matcher for double sized multiplication defined in match.pd. */
+extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
+
+static bool
+convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+ gimple *use_stmt, *complex_res_lo;
+ gimple_stmt_iterator insert_before;
+ imm_use_iterator use_iter;
+ tree match[4]; // arg0, arg1, res_hi, complex_res_lo
+ tree arg0, arg1, widen_mult, new_type, tmp;
+ tree lhs = gimple_assign_lhs (stmt);
+ location_t loc = UNKNOWN_LOCATION;
+ machine_mode mode;
+
+ if (!gimple_double_size_mul_candidate (lhs, match, NULL))
+ return false;
+
+ new_type = build_nonstandard_integer_type (
+ TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
+ mode = TYPE_MODE (new_type);
+
+ /* Return early if the widened multiplication doesn't exist on the target. */
+ if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
+ && !wider_optab_check_p (smul_optab, mode, 1))
+ return false;
+
+ /* Determine the point where the wide multiplication
+ should be inserted. The complex low result is a safe choice since it
+ is required by both the high and low part getters, so it dominates
+ both of them. */
+ complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
+ insert_before = gsi_for_stmt (complex_res_lo);
+ gsi_next (&insert_before);
+
+ /* Create the widen multiplication. */
+ arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
+ arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
+ widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
+ MULT_EXPR, arg0, arg1);
+
+ /* Find the mult low part getter. */
+ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
+ if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
+ break;
+
+ /* Create high and low (if needed) parts extractors. */
+ /* Low part. */
+ if (use_stmt)
+ {
+ loc = gimple_location (use_stmt);
+ tmp = build_and_insert_cast (&insert_before, loc,
+ TREE_TYPE (gimple_get_lhs (use_stmt)),
+ widen_mult);
+ gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
+ NOP_EXPR, tmp);
+ gsi_replace (&insert_before, new_stmt, true);
+ }
+
+ /* High part. */
+ loc = gimple_location (stmt);
+ tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
+ RSHIFT_EXPR, widen_mult,
+ build_int_cst (new_type,
+ TYPE_PRECISION (new_type) / 2));
+ tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
+ gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
+ gsi_replace (gsi, new_stmt, true);
+
+ widen_mul_stats.double_sized_mul_optimized++;
+ return true;
+}
+
/* Find integer multiplications where the operands are extended from
smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
or MULT_HIGHPART_EXPR where appropriate. */
@@ -4987,6 +5062,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
break;
case PLUS_EXPR:
+ if (convert_double_size_mul (&gsi, stmt))
+ break;
+ __attribute__ ((fallthrough));
case MINUS_EXPR:
if (!convert_plusminus_to_widen (&gsi, stmt, code))
match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
@@ -5091,6 +5169,8 @@ pass_optimize_widening_mul::execute (function *fun)
widen_mul_stats.divmod_calls_inserted);
statistics_counter_event (fun, "highpart multiplications inserted",
widen_mul_stats.highpart_mults_inserted);
+ statistics_counter_event (fun, "double sized mul optimized",
+ widen_mul_stats.double_sized_mul_optimized);
return cfg_changed ? TODO_cleanup_cfg : 0;
}
--
2.33.0
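
The net effect in source terms (an editor's sketch; mul64_widened is an
illustrative name): the whole schoolbook sequence of four partial products
in mul64 above collapses into one widening multiply.

unsigned long long
mul64_widened (unsigned int a, unsigned int b)
{
  /* The pass casts both arguments to the double-width type, emits a
     single multiplication, and recovers the halves by shift and cast:
     res_hi == w >> 32, res_lo == (unsigned int) w.  */
  unsigned long long w = (unsigned long long) a * b;
  return w;
}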

[File diff suppressed because it is too large]


@@ -0,0 +1,100 @@
From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Tue, 27 Feb 2024 07:43:57 +0800
Subject: [PATCH 07/18] Port fixes in icp to GCC 12
---
gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++-------
1 file changed, 30 insertions(+), 7 deletions(-)
diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
index 383839189..318535d06 100644
--- a/gcc/ipa-devirt.cc
+++ b/gcc/ipa-devirt.cc
@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map)
if (!map->count (ftype_uid))
return;
type_set* s = (*map)[ftype_uid];
+ if (!s)
+ {
+ fprintf (dump_file, "%d (no set)", ftype_uid);
+ return;
+ }
for (type_set::const_iterator it = s->begin (); it != s->end (); it++)
fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it);
}
@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2)
if (register_ailas_type (type1, type2, ta_map))
analyze_pointees (type1, type2);
}
+ unsigned type1_uid = TYPE_UID (type1);
+ unsigned type2_uid = TYPE_UID (type2);
+ if (type_uid_map->count (type1_uid) == 0)
+ (*type_uid_map)[type1_uid] = type1;
+ if (type_uid_map->count (type2_uid) == 0)
+ (*type_uid_map)[type2_uid] = type2;
+
/* If function and non-function type pointers alias,
the function type is unsafe. */
if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2))
- unsafe_types->insert (TYPE_UID (type1));
+ unsafe_types->insert (type1_uid);
if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1))
- unsafe_types->insert (TYPE_UID (type2));
+ unsafe_types->insert (type2_uid);
/* Try to figure out with pointers to incomplete types. */
if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1)
static void
analyze_global_var (varpool_node *var)
{
- var->get_constructor();
tree decl = var->decl;
- if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl)
- || integer_zerop (DECL_INITIAL (decl)))
+ if (decl || !DECL_INITIAL (decl))
+ return;
+ var->get_constructor ();
+ if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl))
+ || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK)
return;
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt)
{
rhs = TREE_OPERAND (rhs, 0);
if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST
- || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL)
+ || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL
+ || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL
+ || TREE_CODE (rhs) == RESULT_DECL)
rhs_type = build_pointer_type (TREE_TYPE (rhs));
else if (TREE_CODE (rhs) == COMPONENT_REF)
{
@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt)
gcc_assert (POINTER_TYPE_P (rhs_type));
}
else
- gcc_unreachable();
+ {
+ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
+ get_tree_code_name (TREE_CODE (rhs)));
+ print_gimple_stmt (dump_file, stmt, 0);
+ gcc_unreachable ();
+ }
}
else
rhs_type = TREE_TYPE (rhs);
@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases ()
decl_set *d_set = it1->second;
tree type = (*type_uid_map)[it1->first];
type_set *set = (*fta_map)[it1->first];
+ if (!set)
+ continue;
for (type_set::const_iterator it2 = set->begin ();
it2 != set->end (); it2++)
{
--
2.33.0

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]


@@ -0,0 +1,378 @@
From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:07:24 +0800
Subject: [PATCH 12/18] Port maxmin patch to GCC 12
---
gcc/config/aarch64/aarch64-simd.md | 256 ++++++++++++++++++++++++++
gcc/config/aarch64/predicates.md | 19 ++
gcc/testsuite/gcc.dg/combine-maxmin.c | 46 +++++
3 files changed, 321 insertions(+)
create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 82f73805f..de92802f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1138,6 +1138,82 @@
[(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
)
+;; Simplify the extension with a following truncation for the shift+neg operation.
+
+(define_insn_and_split "*aarch64_sshr_neg_v8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 1)
+ (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
+ (match_dup 2)))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (ashiftrt:V8HI
+ (neg:V8HI
+ (match_operand:V8HI 1 "register_operand" "w"))
+ (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
+ {
+ /* Reduce the shift amount to smaller mode. */
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
+ - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
+ operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+ }
+ [(set_attr "type" "multiple")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_operand:V4SI 1 "register_operand" "w"))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_operand:V4SI 3 "register_operand" "w"))
+ (match_dup 2)))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_dup 1))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (set (match_operand:V4SI 3 "register_operand" "=w")
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_dup 3))
+ (match_dup 2)))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (match_dup 1))
+ (truncate:V4HI
+ (match_dup 3))))]
+ ""
+ [(set_attr "type" "multiple")]
+)
+
(define_insn "*aarch64_simd_sra<mode>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(plus:VDQ_I
@@ -1714,6 +1790,26 @@
}
)
+(define_insn "vec_pack_trunc_shifted_<mode>"
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
+ (vec_concat:<VNARROWQ2>
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
+ (match_operand:VQN 2 "half_size_operand" "w")))
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
+ (match_operand:VQN 4 "half_size_operand" "w")))))]
+ "TARGET_SIMD"
+ {
+ if (BYTES_BIG_ENDIAN)
+ return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
+ else
+ return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
+ }
+ [(set_attr "type" "neon_permute<q>")
+ (set_attr "length" "4")]
+)
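
A sketch of the motivating source shape (our example, not from the patch): narrowing two vectors by an arithmetic shift of half the element width is exactly "keep the high halves", which a single uzp2 performs.

/* Hypothetical source shape for vec_pack_trunc_shifted_<mode>:
   both inputs are narrowed by taking the upper 16 bits of each
   32-bit lane, so the vectorizer can emit one uzp2.  */
void
pack_high (short *restrict d, const int *restrict a, const int *restrict b)
{
  for (int i = 0; i < 4; i++)
    {
      d[i] = (short) (a[i] >> 16);
      d[i + 4] = (short) (b[i] >> 16);
    }
}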
+
(define_insn "aarch64_shrn<mode>_insn_le"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
@@ -6652,6 +6748,166 @@
[(set_attr "type" "neon_tst<q>")]
)
+;; Simplify the sign extension followed by truncation for a cmtst-like operation.
+
+(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (plus:V4HI
+ (truncate:V4HI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
+ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
+ (plus:V4HI
+ (truncate:V4HI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_dup 1)
+ (match_dup 2))
+ (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
+ (match_dup 4)))
+ (match_dup 5))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 6 "register_operand" "=w")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (plus:V8HI
+ (eq:V8HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand" "w")
+ (match_dup 6))
+ (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
+ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
+ {
+ if (can_create_pseudo_p ())
+ {
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
+ operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
+ operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);
+
+ operands[6] = gen_reg_rtx (V8HImode);
+ }
+ else
+ FAIL;
+ }
+ [(set_attr "type" "neon_tst_q")]
+)
+
+;; Three helper definitions that allow combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (neg:V4SI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+ (set (match_operand:V4SI 0 "register_operand" "=w")
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 5)
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+ (set (match_dup 0)
+ (neg:V4SI
+ (eq:V4SI
+ (match_dup 0)
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ {
+ if (can_create_pseudo_p ())
+ operands[5] = gen_reg_rtx (V8HImode);
+ else
+ FAIL;
+ }
+ [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (neg:V4SI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+ (set (match_operand:V4SI 0 "register_operand" "=w")
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 5)
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
+ (set (match_dup 0)
+ (neg:V4SI
+ (eq:V4SI
+ (match_dup 0)
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ {
+ if (can_create_pseudo_p ())
+ operands[5] = gen_reg_rtx (V8HImode);
+ else
+ FAIL;
+ }
+ [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (not:V4SI
+ (match_operand:V4SI 1 "register_operand" "w")))
+ (truncate:V4HI
+ (not:V4SI
+ (match_operand:V4SI 2 "register_operand" "w")))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
+ (not:V4SI
+ (match_dup 1)))
+ (set (match_operand:V4SI 2 "register_operand" "=w")
+ (not:V4SI
+ (match_dup 2)))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (match_dup 1))
+ (truncate:V4HI
+ (match_dup 2))))]
+ ""
+ [(set_attr "type" "multiple")]
+)
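
The scalar idiom behind these cmtst patterns can be sketched as follows (illustrative example with an assumed mask; the actual trigger is the widened eq-plus-minus-one chain shown above).

/* ((x & mask) == 0) - 1 is 0 when no masked bit is set and -1
   otherwise, which is the cmtst (test-bits-nonzero) semantics.  */
void
test_bits (short *restrict d, const short *restrict s)
{
  for (int i = 0; i < 8; i++)
    d[i] = (short) (((s[i] & 0xff00) == 0) - 1);
}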
+
(define_insn_and_split "aarch64_cmtstdi"
[(set (match_operand:DI 0 "register_operand" "=w,r")
(neg:DI
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 07c14aacb..1b8496c07 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -118,6 +118,25 @@
(match_test "aarch64_simd_valid_immediate (op, NULL,
AARCH64_CHECK_ORR)"))))
+(define_predicate "aarch64_bic_imm_for_maxmin"
+ (match_code "const_vector")
+{
+ if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
+ return false;
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
+ return CONST_INT_P (op)
+ && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
+})
+
+(define_predicate "maxmin_arith_shift_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
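
For concreteness, a few values the two new predicates accept or reject, worked by hand from the checks above (illustrative, not exhaustive):

/* maxmin_arith_shift_operand: a const vector duplicating
   GET_MODE_UNIT_BITSIZE - 1, e.g. {31,31,31,31} for V4SI;
   {16,16,16,16} would be rejected.
   aarch64_bic_imm_for_maxmin: a valid BIC immediate whose inverse
   fits below the all-ones element, e.g. {0xff00,...} for V8HI,
   since ~0xff00 = 0x00ff < 0xffff.  */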
+
(define_predicate "aarch64_reg_or_bic_imm"
(ior (match_operand 0 "register_operand")
(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
new file mode 100755
index 000000000..06bce7029
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -0,0 +1,46 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+
+/* The test checks usage of smax/smin insns for clip evaluation and
+ * uzp1/uzp2 insns for vector element narrowing. It's inspired by
+ * sources of x264 codec. */
+
+typedef unsigned char uint8_t;
+typedef long int intptr_t;
+typedef signed short int int16_t;
+
+static __attribute__((always_inline)) inline uint8_t clip (int x )
+{
+ return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
+}
+
+void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ intptr_t stride, int width, int height, int16_t *buf)
+{
+ const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+ for( int y = 0; y < height; y++ ) {
+ for( int x = -2; x < width+3; x++ ) {
+ int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+ dstv[x] = clip ( (v + 16) >> 5 );
+ buf[x+2] = v + pad;
+ }
+ for( int x = 0; x < width; x++ )
+ dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+ - 32*pad + 512) >> 10);
+ for( int x = 0; x < width; x++ )
+ dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ + 16) >> 5);
+ dsth += stride;
+ dstv += stride;
+ dstc += stride;
+ src += stride;
+ }
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
--
2.33.0
@@ -0,0 +1,239 @@
From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 22 Feb 2024 17:13:27 +0800
Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12
---
gcc/common.opt | 4 +
gcc/match.pd | 104 ++++++++++++++++++++++++
gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++
gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++
gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++--
5 files changed, 151 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c
diff --git a/gcc/common.opt b/gcc/common.opt
index 6c6fabb31..3a5004271 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1846,6 +1846,10 @@ fif-conversion-gimple
Common Var(flag_if_conversion_gimple) Optimization
Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+fconvert-minmax
+Common Var(flag_convert_minmax) Optimization
+Convert saturating clipping to min max.
+
fstack-reuse=
Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
-fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 61866cb90..3a19e93b3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8031,3 +8031,107 @@ and,
(plus:c@4 (op2:c @0 @1)
(plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
(if (single_use (@4) && single_use (@5)))))
+
+/* MinMax pattern matching helpers. More info on the transformation below. */
+
+/* Match (a & 0b11..100..0) pattern. */
+(match (minmax_cmp_arg @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+
+/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
+ This statement is blocking for the transformation of unsigned integers.
+ Do type check here to avoid unnecessary duplications. */
+(match (minmax_sat_arg @0)
+ (rshift (negate @0) INTEGER_CST@1)
+ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
+ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
+
+/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
+ The matched pattern can be described as saturated clipping.
+
+ The pattern supports truncation via both casts and bit_and.
+ Also there are patterns for possible inverted conditions. */
+(if (flag_convert_minmax)
+/* Truncation via casts. Unfortunately convert? cannot be applied here
+ because convert and cond take different number of arguments. */
+ (simplify
+ (convert
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0)))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ (simplify
+ (convert
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (minmax_sat_arg @0))))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (minmax_sat_arg @0)))
+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ /* Truncation via bit_and with mask. Same concerns on convert? here. */
+ (simplify
+ (convert
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+ (convert? @0)))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+ (convert? @0))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+
+ (simplify
+ (convert
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+ (convert? @0)
+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
+ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+ (convert (min (max @0 { integer_zero_node; })
+ { mask; }))))))
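
A hand-worked before/after of the rewrite, in C (our sketch, mirroring the clip() helper from combine-maxmin.c; exact equivalence assumes -x does not overflow):

/* Matched form: saturating 8-bit clip written with a mask test.  */
static inline unsigned char
clip_before (int x)
{
  return (x & ~((1 << 8) - 1)) ? ((-x) >> 31) & ((1 << 8) - 1) : x;
}

/* Rewritten form: min (max (x, 0), 255).  */
static inline unsigned char
clip_after (int x)
{
  int t = x > 0 ? x : 0;
  return t < 255 ? t : 255;
}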
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
new file mode 100644
index 000000000..859ff7df8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int32_t *restrict a, int32_t *restrict x)
+{
+ for (int i = 0; i < 4; i++)
+ a[i] = ((((-x[i]) >> 31) ^ x[i])
+ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
+}
+
+/* { dg-final { scan-assembler-not {smax\t} } } */
+/* { dg-final { scan-assembler-not {smin\t} } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
new file mode 100644
index 000000000..63d4d85b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int8_t *restrict a, int32_t *restrict x)
+{
+ for (int i = 0; i < 8; i++)
+ a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index 06bce7029..a984fa560 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -1,5 +1,5 @@
/* { dg-do compile { target aarch64-*-* } } */
-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+/* { dg-options "-O3 -fconvert-minmax" } */
/* The test checks usage of smax/smin insns for clip evaluation and
* uzp1/uzp2 insns for vector element narrowing. It's inspired by
@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
{
const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
for( int y = 0; y < height; y++ ) {
+ /* This loop is not being vectorized now. */
for( int x = -2; x < width+3; x++ ) {
int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
dstv[x] = clip ( (v + 16) >> 5 );
buf[x+2] = v + pad;
}
+
+ /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN. */
for( int x = 0; x < width; x++ )
dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
- 32*pad + 512) >> 10);
+
+ /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN. */
for( int x = 0; x < width; x++ )
dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ 16) >> 5);
+
dsth += stride;
dstv += stride;
dstc += stride;
@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
}
}
-/* { dg-final { scan-assembler-times {smax\t} 4 } } */
-/* { dg-final { scan-assembler-times {smin\t} 4 } } */
-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
+/* Max is performed against 0 on signed values, so match smax exactly. */
+/* { dg-final { scan-assembler-times {smax\t} 6 } } */
+/* Min is performed on a signed value > 0 and a mask, so the sign of the min doesn't matter. */
+/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
+/* All of the vectorized patterns are expected to be matched. */
+/* { dg-final { scan-assembler-not {cmtst\t} } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
--
2.33.0


@@ -0,0 +1,65 @@
From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:20:17 +0800
Subject: [PATCH 14/18] Add new pattern to pass the maxmin tests
---
gcc/match.pd | 24 ++++++++++++++++++++++++
gcc/testsuite/gcc.dg/combine-maxmin.c | 2 +-
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/gcc/match.pd b/gcc/match.pd
index 3a19e93b3..aee58e47b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8038,6 +8038,10 @@ and,
(match (minmax_cmp_arg @0 @1)
(bit_and @0 INTEGER_CST@1)
(if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+/* Match ((unsigned) a > 0b0..01..1) pattern. */
+(match (minmax_cmp_arg1 @0 @1)
+ (gt @0 INTEGER_CST@1)
+ (if (wi::popcount (wi::to_widest (@1) + 1) == 1)))
/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
This statement is blocking for the transformation of unsigned integers.
@@ -8095,6 +8099,26 @@ and,
(convert (min (max @0 { integer_zero_node; })
{ mask; })))))
+ (simplify
+ (convert
+ (cond
+ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0)))
+ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+ { mask; })))))
+ (simplify
+ (cond
+ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+ (convert? (minmax_sat_arg @0))
+ (convert? @0))
+ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+ { mask; })))))
+
/* Truncation via bit_and with mask. Same concerns on convert? here. */
(simplify
(convert
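
The new minmax_cmp_arg1 helper admits the variant where the range test is written as an unsigned compare rather than a bit mask; an illustrative scalar form (our example, echoing the clip idiom above):

/* (unsigned) x > 255 holds exactly when x & ~255 != 0, so this clip
   variant is rewritten to min (max (x, 0), 255) as well.  */
static inline unsigned char
clip_ucmp (int x)
{
  return ((unsigned) x > 255) ? ((-x) >> 31) & 255 : x;
}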
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index a984fa560..5c0c9cc49 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
/* All of the vectorized patterns are expected to be matched. */
/* { dg-final { scan-assembler-not {cmtst\t} } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 2 } } */
--
2.33.0

File diff suppressed because it is too large

@@ -0,0 +1,27 @@
From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Dec 2023 23:31:26 +0800
Subject: [PATCH 16/18] [crypto-accel] add optimization level requirement to
 the gate

Fix issue (src-openEuler/gcc: I8RRDW).
---
gcc/crypto-accel.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc
index f4e810a6b..e7766a585 100644
--- a/gcc/crypto-accel.cc
+++ b/gcc/crypto-accel.cc
@@ -2391,7 +2391,7 @@ public:
/* opt_pass methods: */
virtual bool gate (function *)
{
- if (flag_crypto_accel_aes <= 0)
+ if (flag_crypto_accel_aes <= 0 || optimize < 1)
return false;
return targetm.get_v16qi_mode
&& targetm.gen_rev32v16qi
--
2.33.0
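
A condensed model of the new gate (our sketch; the flag variable comes from the patch, while the function name here is hypothetical): with the extra condition, enabling the AES option at -O0 no longer runs the pass.

/* Gate model: require both the option and at least -O1.  */
static int
crypto_accel_gate_model (int flag_crypto_accel_aes, int optimize)
{
  return flag_crypto_accel_aes > 0 && optimize >= 1;
}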


@@ -0,0 +1,239 @@
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Tue, 12 Mar 2024 23:30:56 +0800
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
 vectorization

Take the minimum of the iteration count and the segment length; this helps
to speed up loops with a small number of iterations when only the tail can
be vectorized.
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.cc | 67 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.cc | 14 +++-
5 files changed, 95 insertions(+), 25 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 6176d4790..7e5c119cf 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use the minimum of the different segment lengths. Currently the minimum of
+the iteration count and the vectorization length is chosen by this param.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c35..e6ae9e847 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
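
A minimal model of the combined runtime test built above (our simplification to byte offsets; the real code builds GENERIC trees): the VF-based disjointness condition is OR'ed with a niters-based one, which amounts to checking against the minimum of the two segment lengths.

/* Segments [a, a+len) and [b, b+len) are disjoint when the bases are
   at least len apart; either length may admit vectorization.  */
static int
segments_ok (long a, long b, long len_vf, long len_iters)
{
  long d = b - a;
  int ok_vf = d >= len_vf || d <= -len_vf;
  int ok_iters = d >= len_iters || d <= -len_iters;
  return ok_vf || ok_iters;
}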
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index f643a95b2..9bc5f16ee 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -213,12 +213,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of the segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4e615b80b..04e68f621 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case, remember scalar_loop_iters; this helps to create a
+ flexible aliasing check for a small number of iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,94 @@
From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Fri, 29 Mar 2024 17:15:41 +0800
Subject: [PATCH 2/2] Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7)
---
gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
index 9537e4835..1ceb5137f 100644
--- a/gcc/ipa-prefetch.cc
+++ b/gcc/ipa-prefetch.cc
@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map;
typedef std::map<memref_t *, tree> memref_tree_map;
typedef std::set<gimple *> stmt_set;
+typedef std::set<tree> tree_set;
typedef std::map<tree, tree> tree_map;
tree_memref_map *tm_map;
@@ -1124,8 +1125,21 @@ analyse_loops ()
}
}
+/* Compare memrefs by IDs; helper for qsort. */
+
+static int
+memref_id_cmp (const void *p1, const void *p2)
+{
+ const memref_t *mr1 = *(const memref_t **) p1;
+ const memref_t *mr2 = *(const memref_t **) p2;
+
+ if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id)
+ return 1;
+ return -1;
+}
+
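
The point of the comparator, sketched standalone (our model; it assumes mr_id values are distinct, so the comparator never sees equal keys): sets keyed by pointers iterate in address order, which varies between runs, so sorting by the stable ID makes the emitted statements deterministic.

#include <stdlib.h>

struct memref { unsigned mr_id; };

static int
id_cmp (const void *p1, const void *p2)
{
  const struct memref *a = *(const struct memref *const *) p1;
  const struct memref *b = *(const struct memref *const *) p2;
  return a->mr_id > b->mr_id ? 1 : -1;
}

/* Sort memref pointers into a run-to-run stable order by their IDs,
   as the patch does after copying the set into a vector.  */
static void
stable_order (struct memref **v, size_t n)
{
  qsort (v, n, sizeof *v, id_cmp);
}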
/* Reduce the set filtering out memrefs with the same memory references,
- return the result vector of memrefs. */
+ sort and return the result vector of memrefs. */
static void
reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
vec.safe_push (mr1);
}
}
+ vec.qsort (memref_id_cmp);
if (dump_file)
{
fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ());
@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn)
}
/* Create other new vars. Insert new stmts. */
+ vec<memref_t *> used_mr_vec = vNULL;
for (memref_set::const_iterator it = used_mrs.begin ();
it != used_mrs.end (); it++)
+ used_mr_vec.safe_push (*it);
+ used_mr_vec.qsort (memref_id_cmp);
+
+ for (unsigned int j = 0; j < used_mr_vec.length (); j++)
{
- memref_t *mr = *it;
+ memref_t *mr = used_mr_vec[j];
if (mr == comp_mr)
continue;
gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0,
@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn)
local = integer_three_node;
break;
}
+ tree_set prefetched_addrs;
for (unsigned int j = 0; j < vmrs.length (); j++)
{
memref_t *mr = vmrs[j];
@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn)
tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
if (decl_map->count (addr))
addr = (*decl_map)[addr];
+ if (prefetched_addrs.count (addr))
+ continue;
last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
3, addr, write_p, local);
pcalls.safe_push (last_stmt);
gimple_seq_add_stmt (&stmts, last_stmt);
+ prefetched_addrs.insert (addr);
if (dump_file)
{
fprintf (dump_file, "Insert %d prefetch stmt:\n", j);
--
2.33.0
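
The prefetched_addrs guard above keeps only one __builtin_prefetch per address; a standalone model (ours: the patch uses a set of GIMPLE address trees, while this sketch uses a linear scan over plain pointers for brevity):

/* Emit at most one prefetch per distinct address.  */
static void
emit_prefetches (void *addrs[], int n)
{
  for (int i = 0; i < n; i++)
    {
      int seen = 0;
      for (int j = 0; j < i; j++)
	if (addrs[j] == addrs[i])
	  {
	    seen = 1;
	    break;
	  }
      if (seen)
	continue;
      __builtin_prefetch (addrs[i], 0 /* read */, 3 /* high locality */);
    }
}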

File diff suppressed because it is too large


@@ -0,0 +1,28 @@
From 9dc3df938b9ed2c27498c8548087fee1ce930366 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= <zhengchenhui1@huawei.com>
Date: Tue, 2 Apr 2024 11:08:30 +0800
Subject: [PATCH] [Struct Reorg] Bugfix for structure pointer compression
---
gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 2 ++
1 file changed, 2 insertions(+)
diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index fa33f2d35..3922873f3 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7541,9 +7541,11 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void)
if (!type->has_legal_alloc_num)
{
if (current_layout_opt_level & POINTER_COMPRESSION_UNSAFE)
+ {
if (dump_file)
fprintf (dump_file, " has unknown alloc size, but"
" in unsafe mode, so");
+ }
else
{
if (dump_file)
--
2.33.0
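
The defect fixed here is the classic dangling-else: without the added braces, the else binds to the inner if (dump_file), so the rejection path meant for the safe-mode case instead ran in unsafe mode whenever dumping was off, and never ran otherwise. In miniature (our illustrative names):

#include <stdio.h>

static int in_unsafe_mode, reject_count;
static FILE *dump_file;

static void
check_type (void)
{
  if (in_unsafe_mode)
    if (dump_file)
      fprintf (dump_file, " has unknown alloc size, but in unsafe mode, so");
  else	/* Binds to `if (dump_file)', despite the indentation.  */
    reject_count++;
}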


@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 20
+%global gcc_release 21
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -172,6 +172,26 @@ Patch31: 0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch
 Patch32: 0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch
 Patch33: 0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch
 Patch34: 0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch
+Patch35: 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
+Patch36: 0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
+Patch37: 0037-Perform-early-if-conversion-of-simple-arithmetic.patch
+Patch38: 0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
+Patch39: 0039-Match-double-sized-mul-pattern.patch
+Patch40: 0040-Port-icp-patch-to-GCC-12.patch
+Patch41: 0041-Port-fixes-in-icp-to-GCC-12.patch
+Patch42: 0042-Add-split-complex-instructions-pass.patch
+Patch43: 0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
+Patch44: 0044-Port-maxmin-patch-to-GCC-12.patch
+Patch45: 0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
+Patch46: 0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
+Patch47: 0047-AES-Implement-AES-pattern-matching.patch
+Patch48: 0048-crypto-accel-add-optimization-level-requirement-to-t.patch
+Patch49: 0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
+Patch50: 0050-Port-IPA-prefetch-to-GCC-12.patch
+Patch51: 0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
+Patch52: 0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
+Patch53: 0053-struct-reorg-Add-Semi-Relayout.patch
+Patch54: 0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch

 # Part 3000 ~ 4999
 %ifarch loongarch64
@@ -801,6 +821,26 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch32 -p1
 %patch33 -p1
 %patch34 -p1
+%patch35 -p1
+%patch36 -p1
+%patch37 -p1
+%patch38 -p1
+%patch39 -p1
+%patch40 -p1
+%patch41 -p1
+%patch42 -p1
+%patch43 -p1
+%patch44 -p1
+%patch45 -p1
+%patch46 -p1
+%patch47 -p1
+%patch48 -p1
+%patch49 -p1
+%patch50 -p1
+%patch51 -p1
+%patch52 -p1
+%patch53 -p1
+%patch54 -p1

 %ifarch loongarch64
 %patch3001 -p1
@@ -3186,6 +3226,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*

 %changelog
+* Thu Apr 11 2024 Zhengchen Hui <zhengchenhui1@huawei.com> - 12.3.1-21
+- Type: Sync
+- DESC: Sync patch from openeuler/gcc
+
 * Thu Apr 11 2024 Zhenyu Zhao <zhaozhenyu17@huawei.com> - 12.3.1-20
 - Type: Sync
 - DESC: Sync patch from openeuler/gcc