[Sync] Sync patch from openeuler/gcc
This commit is contained in:
parent 5db544f251
commit 3ee8545fc2

0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch (new file, 194 lines)
@@ -0,0 +1,194 @@
From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 11:27:43 +0300
Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation

---
 gcc/config/aarch64/aarch64-simd.md  | 48 +++++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64.cc       | 15 +++++++++
 gcc/config/aarch64/aarch64.opt      |  4 +++
 gcc/config/aarch64/iterators.md     |  3 +-
 gcc/config/aarch64/predicates.md    | 25 +++++++++++++++
 gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
 6 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ee7f0b89c..82f73805f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6454,6 +6454,54 @@
   [(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
 )
 
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+	(minus:<V_INT_EQUIV>
+	  (ashift:<V_INT_EQUIV>
+	    (and:<V_INT_EQUIV>
+	      (lshiftrt:<V_INT_EQUIV>
+		(match_operand:VDQHSD 1 "register_operand" "w")
+		(match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+	      (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+	    (match_operand:VDQHSD 4 "half_size_operand"))
+	  (and:<V_INT_EQUIV>
+	    (lshiftrt:<V_INT_EQUIV>
+	      (match_dup 1)
+	      (match_dup 2))
+	    (match_dup 3))))]
+  "TARGET_SIMD && flag_cmlt_arith"
+  "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+  [(set_attr "type" "neon_compare_zero")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*arch64_cmlt_tmp<mode>"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+	(and:<V_INT_EQUIV>
+	  (lshiftrt:<V_INT_EQUIV>
+	    (match_operand:VDQHSD 1 "register_operand" "w")
+	    (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+  "TARGET_SIMD && flag_cmlt_arith"
+  "#"
+  "&& reload_completed"
+  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+	(lshiftrt:<V_INT_EQUIV>
+	  (match_operand:VDQHSD 1 "register_operand")
+	  (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+   (set (match_dup 0)
+	(and:<V_INT_EQUIV>
+	  (match_dup 0)
+	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+  ""
+  [(set_attr "type" "neon_compare_zero")]
+)
+
 (define_insn_and_split "aarch64_cm<optab>di"
   [(set (match_operand:DI 0 "register_operand" "=w,w,r")
 	(neg:DI
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a3da4ca30..04072ca25 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14064,6 +14064,21 @@ cost_minus:
 	return true;
       }
 
+    /* Detect aarch64_cmlt_as_arith instruction. Now only this pattern
+       matches the condition. The costs of cmlt and sub instructions
+       are comparable, so we are not increasing the cost here. */
+    if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+	&& GET_CODE (op1) == AND)
+      {
+	rtx op0_subop0 = XEXP (op0, 0);
+	if (rtx_equal_p (op0_subop0, op1))
+	  {
+	    rtx lshrt_op = XEXP (op0_subop0, 0);
+	    if (GET_CODE (lshrt_op) == LSHIFTRT)
+	      return true;
+	  }
+      }
+
     /* Look for SUB (extended register). */
     if (is_a <scalar_int_mode> (mode)
 	&& aarch64_rtx_arith_op_extract_p (op1))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a64b927e9..101664c7c 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
 This option is for use with fstack-protector-strong and not for use in
 user-land code.
 
+mcmlt-arith
+Target Var(flag_cmlt_arith) Optimization Init(0)
+Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
 TargetVariable
 long aarch64_stack_protector_guard_offset = 0
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 26a840d7f..967e6b0b1 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1485,7 +1485,8 @@
 			   (V2DI "2s")])
 
 ;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+			   (V8HI "16b") (V4SI "8h")
 			   (V2DI "4s")])
 
 ;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac..07c14aacb 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -49,6 +49,31 @@
   return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
 })
 
+(define_predicate "half_size_minus_one_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+  unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+  return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
 (define_predicate "subreg_lowpart_operator"
   (ior (match_code "truncate")
        (and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks usage of cmlt insns for arithmetic/logic calculations
+ * in foo (). It's inspired by sources of x264 codec. */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
+void foo( uint32_t *a, uint32_t *b)
+{
+  for (unsigned i = 0; i < 4; i++)
+  {
+    uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+		  &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+    b[i] = (a[i]+s)^s;
+  }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0

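For readers unfamiliar with the trick in patch 0035: the insn pattern above matches an arithmetic idiom that replicates the sign bit of each packed half-element across that half, which is exactly what cmlt <reg>, <reg>, #0 produces (all-ones lanes where the element is negative). The following self-checking C sketch of that equivalence is our own illustration, not part of the synced patch; the function names are hypothetical.

/* Illustrative only: why the arithmetic form matched by
   *aarch64_cmlt_as_arith equals a per-element "compare signed
   less than zero" on 16-bit halves packed in a 32-bit word.  */
#include <assert.h>
#include <stdint.h>

/* The arithmetic shape the insn pattern matches (SImode example). */
static uint32_t cmlt_as_arith (uint32_t a)
{
  uint32_t t = (a >> 15) & 0x00010001u;  /* sign bit of each 16-bit half */
  return (t << 16) - t;                  /* 0xFFFF for each negative half */
}

/* Reference: element-wise cmlt #0 on the two 16-bit halves. */
static uint32_t cmlt_ref (uint32_t a)
{
  uint32_t lo = (int16_t) (a & 0xFFFF) < 0 ? 0x0000FFFFu : 0;
  uint32_t hi = (int16_t) (a >> 16) < 0 ? 0xFFFF0000u : 0;
  return hi | lo;
}

int main (void)
{
  uint32_t tests[] = { 0, 0x8000u, 0x80000000u, 0x80008000u, 0x7FFF7FFFu };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
    assert (cmlt_as_arith (tests[i]) == cmlt_ref (tests[i]));
  return 0;
}
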
0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch (new file, 560 lines)
@@ -0,0 +1,560 @@
From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 28 Feb 2024 23:05:12 +0800
Subject: [PATCH 02/18] [rtl-ifcvt] introduce rtl ifcvt enchancements new
 option: -fifcvt-allow-complicated-cmps: allows ifcvt to deal
 with complicated cmps like

 cmp reg1 (reg2 + reg3)

 can increase compilation time
 new param:
 -param=ifcvt-allow-register-renaming=[0,1,2]
 1 : allows ifcvt to rename registers in then and else bb
 2 : allows to rename registers in condition and else/then bb
 can increase compilation time and register pressure
---
 gcc/common.opt                               |   4 +
 gcc/ifcvt.cc                                 | 291 +++++++++++++++---
 gcc/params.opt                               |   4 +
 .../gcc.c-torture/execute/ifcvt-renaming-1.c |  35 +++
 gcc/testsuite/gcc.dg/ifcvt-6.c               |  27 ++
 5 files changed, 311 insertions(+), 50 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
 create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c

diff --git a/gcc/common.opt b/gcc/common.opt
index c7c6bc256..aa00fb7b0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3691,4 +3691,8 @@ fipa-ra
 Common Var(flag_ipa_ra) Optimization
 Use caller save register across calls if possible.
 
+fifcvt-allow-complicated-cmps
+Common Var(flag_ifcvt_allow_complicated_cmps) Optimization
+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
+
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 2c1eba312..584db7b55 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
     }
 
   /* Don't even try if the comparison operands or the mode of X are weird. */
-  if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
+  if (!flag_ifcvt_allow_complicated_cmps
+      && (cond_complex
+	  || !SCALAR_INT_MODE_P (GET_MODE (x))))
     return NULL_RTX;
 
   return emit_store_flag (x, code, XEXP (cond, 0),
@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
   /* Currently support only simple single sets in test_bb. */
   if (!sset
      || !noce_operand_ok (SET_DEST (sset))
-     || contains_ccmode_rtx_p (SET_DEST (sset))
+     || (!flag_ifcvt_allow_complicated_cmps
+	 && contains_ccmode_rtx_p (SET_DEST (sset)))
      || !noce_operand_ok (SET_SRC (sset)))
     return false;
 
@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
    in this function. */
 
 static bool
-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+bbs_ok_for_cmove_arith (basic_block bb_a,
+			basic_block bb_b,
+			rtx to_rename,
+			bitmap conflict_regs)
 {
   rtx_insn *a_insn;
   bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
-
+  bitmap intersections = BITMAP_ALLOC (&reg_obstack);
   df_ref def;
   df_ref use;
+  rtx_insn *last_a = last_active_insn (bb_a, FALSE);
 
   FOR_BB_INSNS (bb_a, a_insn)
     {
@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
       rtx sset_a = single_set (a_insn);
 
       if (!sset_a)
-	{
-	  BITMAP_FREE (bba_sets);
-	  return false;
-	}
+	goto end_cmove_arith_check_and_fail;
       /* Record all registers that BB_A sets. */
       FOR_EACH_INSN_DEF (def, a_insn)
-	if (!(to_rename && DF_REF_REG (def) == to_rename))
+	if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a))
 	  bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
     }
 
+  bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
   rtx_insn *b_insn;
-
   FOR_BB_INSNS (bb_b, b_insn)
     {
       if (!active_insn_p (b_insn))
@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
       rtx sset_b = single_set (b_insn);
 
       if (!sset_b)
-	{
-	  BITMAP_FREE (bba_sets);
-	  return false;
-	}
+	goto end_cmove_arith_check_and_fail;
 
       /* Make sure this is a REG and not some instance
	 of ZERO_EXTRACT or SUBREG or other dangerous stuff.
@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
       if (MEM_P (SET_DEST (sset_b)))
 	gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
       else if (!REG_P (SET_DEST (sset_b)))
-	{
-	  BITMAP_FREE (bba_sets);
-	  return false;
-	}
+	goto end_cmove_arith_check_and_fail;
 
-      /* If the insn uses a reg set in BB_A return false. */
+      /* If the insn uses a reg set in BB_A return false
	 or try to collect register list for renaming. */
       FOR_EACH_INSN_USE (use, b_insn)
 	{
-	  if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
+	  if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
 	    {
-	      BITMAP_FREE (bba_sets);
-	      return false;
+	      if (param_ifcvt_allow_register_renaming < 1)
+		goto end_cmove_arith_check_and_fail;
+
+	      /* Those regs should be renamed. We can't rename CC reg, but
		 possibly we can provide combined comparison in the future. */
+	      if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
+		goto end_cmove_arith_check_and_fail;
+	      bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
 	    }
 	}
-
     }
 
   BITMAP_FREE (bba_sets);
+  BITMAP_FREE (intersections);
   return true;
+
+end_cmove_arith_check_and_fail:
+  BITMAP_FREE (bba_sets);
+  BITMAP_FREE (intersections);
+  return false;
 }
 
 /* Emit copies of all the active instructions in BB except the last.
@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
   return true;
 }
 
+/* This function tries to rename regs that intersect with considered bb
+   inside condition expression. Condition expression will be moved down
+   if the optimization will be applied, so it is essential to be sure that
+   all intersected registers will be renamed otherwise transformation
+   can't be applied. Function returns true if renaming was successful
+   and optimization can proceed futher. */
+
+static bool
+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+{
+  bool success = true;
+  if (bitmap_empty_p (cond_rename_regs))
+    return true;
+  if (param_ifcvt_allow_register_renaming < 2)
+    return false;
+  df_ref use;
+  rtx_insn *cmp_insn = if_info->cond_earliest;
+  /* Jump instruction as a condion currently unsupported. */
+  if (JUMP_P (cmp_insn))
+    return false;
+  rtx_insn *before_cmp = PREV_INSN (cmp_insn);
+  start_sequence ();
+  rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
+  basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+  FOR_EACH_INSN_USE (use, cmp_insn)
+    {
+      if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
+	{
+	  rtx use_reg = DF_REF_REG (use);
+	  rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
+	  if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
+	    {
+	      end_sequence ();
+	      return false;
+	    }
+	  noce_emit_move_insn (tmp, use_reg);
+	}
+    }
+
+  emit_insn (PATTERN (copy_of_cmp));
+  rtx_insn *seq = get_insns ();
+  unshare_all_rtl_in_chain (seq);
+  end_sequence ();
+
+  emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+  delete_insn_and_edges (cmp_insn);
+  rtx_insn *insn;
+  FOR_BB_INSNS (cmp_block, insn)
+    df_insn_rescan (insn);
+
+  if_info->cond = noce_get_condition (if_info->jump,
+				      &copy_of_cmp,
+				      if_info->then_else_reversed);
+  if_info->cond_earliest = copy_of_cmp;
+  if_info->rev_cond = NULL_RTX;
+
+  return success;
+}
+
+/* This function tries to rename regs that intersect with considered bb.
+   return true if the renaming was successful and optimization can
+   proceed futher, false otherwise. */
+static bool
+noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
+{
+  if (bitmap_empty_p (rename_regs))
+    return true;
+  rtx_insn *insn;
+  rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
+  bool res = true;
+  start_sequence ();
+  FOR_BB_INSNS (test_bb, insn)
+    {
+      if (!active_insn_p (insn))
+	continue;
+      /* Only ssets are supported for now. */
+      rtx sset = single_set (insn);
+      gcc_assert (sset);
+      rtx x = SET_DEST (sset);
+      if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x)))
+	continue;
+      /* Do not need to rename dest in the last instruction
	 it will be renamed anyway. */
+      if (insn == last_insn)
+	continue;
+      machine_mode mode = GET_MODE (x);
+      rtx tmp = gen_reg_rtx (mode);
+      if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
+	{
+	  gcc_assert (insn != last_insn);
+	  /* We can generate additional move for such case,
	     but it will increase register preasure.
	     For now just stop transformation. */
+	  rtx result_rtx = SET_DEST (single_set (last_insn));
+	  if (REG_P (result_rtx) && (x != result_rtx))
+	    {
+	      res = false;
+	      break;
+	    }
+	  if (!validate_replace_rtx (x, tmp, insn))
+	    gcc_unreachable ();
+	  noce_emit_move_insn (tmp,x);
+	}
+      set_used_flags (insn);
+      rtx_insn *rename_candidate;
+      for (rename_candidate = NEXT_INSN (insn);
	   rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb));
	   rename_candidate = NEXT_INSN (rename_candidate))
+	{
+	  if (!reg_overlap_mentioned_p (x, rename_candidate))
+	    continue;
+
+	  int replace_res = TRUE;
+	  if (rename_candidate == last_insn)
+	    {
+	      validate_replace_src_group (x, tmp, rename_candidate);
+	      replace_res = apply_change_group ();
+	    }
+	  else
+	    replace_res = validate_replace_rtx (x, tmp, rename_candidate);
+	  gcc_assert (replace_res);
+	  set_used_flags (rename_candidate);
+	}
+      set_used_flags (x);
+      set_used_flags (tmp);
+    }
+  rtx_insn *seq = get_insns ();
+  unshare_all_rtl_in_chain (seq);
+  end_sequence ();
+  emit_insn_before_setloc (seq, first_active_insn (test_bb),
			   INSN_LOCATION (first_active_insn (test_bb)));
+  FOR_BB_INSNS (test_bb, insn)
+    df_insn_rescan (insn);
+  return res;
+}
+
 /* Try more complex cases involving conditional_move. */
 
 static int
@@ -2185,11 +2331,30 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
 	  std::swap (then_bb, else_bb);
 	}
     }
-
+  bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+  bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
   if (then_bb && else_bb
-      && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
-	  || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
-    return FALSE;
+      && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
				   if_info->orig_x,
				   then_bb_rename_regs)
+	  || !bbs_ok_for_cmove_arith (else_bb, then_bb,
				      if_info->orig_x,
				      else_bb_rename_regs)))
+    {
+      BITMAP_FREE (then_bb_rename_regs);
+      BITMAP_FREE (else_bb_rename_regs);
+      return FALSE;
+    }
+  bool prepass_renaming = noce_rename_regs_in_bb (then_bb,
						  then_bb_rename_regs)
			  && noce_rename_regs_in_bb (else_bb,
						     else_bb_rename_regs);
+
+  BITMAP_FREE (then_bb_rename_regs);
+  BITMAP_FREE (else_bb_rename_regs);
+
+  if (!prepass_renaming)
+    return FALSE;
 
   start_sequence ();
 
@@ -3072,7 +3237,8 @@ noce_operand_ok (const_rtx op)
 
 static bool
 bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
-			     unsigned int *cost, bool *simple_p)
+			     unsigned int *cost, bool *simple_p,
+			     bitmap cond_rename_regs)
 {
   if (!test_bb)
     return false;
@@ -3112,8 +3278,9 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
   rtx_insn *prev_last_insn = PREV_INSN (last_insn);
   gcc_assert (prev_last_insn);
 
-  /* For now, disallow setting x multiple times in test_bb. */
-  if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
+  if (REG_P (x)
+      && reg_set_between_p (x, first_insn, prev_last_insn)
+      && param_ifcvt_allow_register_renaming < 1)
     return false;
 
   bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
@@ -3125,25 +3292,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
   rtx_insn *insn;
   FOR_BB_INSNS (test_bb, insn)
     {
-      if (insn != last_insn)
-	{
-	  if (!active_insn_p (insn))
-	    continue;
+      if (insn == last_insn)
+	continue;
+      if (!active_insn_p (insn))
+	continue;
 
-	  if (!insn_valid_noce_process_p (insn, cc))
-	    goto free_bitmap_and_fail;
+      if (!insn_valid_noce_process_p (insn, cc))
+	goto free_bitmap_and_fail;
 
-	  rtx sset = single_set (insn);
-	  gcc_assert (sset);
+      rtx sset = single_set (insn);
+      gcc_assert (sset);
 
-	  if (contains_mem_rtx_p (SET_SRC (sset))
-	      || !REG_P (SET_DEST (sset))
-	      || reg_overlap_mentioned_p (SET_DEST (sset), cond))
-	    goto free_bitmap_and_fail;
+      if (contains_mem_rtx_p (SET_SRC (sset))
	  || !REG_P (SET_DEST (sset)))
+	goto free_bitmap_and_fail;
 
-	  potential_cost += pattern_cost (sset, speed_p);
-	  bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+      if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
+	{
+	  if (param_ifcvt_allow_register_renaming < 1)
+	    goto free_bitmap_and_fail;
+	  rtx sset_dest = SET_DEST (sset);
+	  if (REG_P (sset_dest)
	      && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
+	    bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
+	  else
+	    goto free_bitmap_and_fail;
 	}
+      potential_cost += pattern_cost (sset, speed_p);
+      if (SET_DEST (sset) != SET_DEST (last_set))
+	bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
     }
 
   /* If any of the intermediate results in test_bb are live after test_bb
@@ -3777,15 +3954,29 @@ noce_process_if_block (struct noce_if_info *if_info)
 
   bool speed_p = optimize_bb_for_speed_p (test_bb);
   unsigned int then_cost = 0, else_cost = 0;
+  bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
   if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
-				    &if_info->then_simple))
-    return false;
+				    &if_info->then_simple, cond_rename_regs))
+    {
+      BITMAP_FREE (cond_rename_regs);
+      return false;
+    }
 
   if (else_bb
       && !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
-				       &if_info->else_simple))
-    return false;
+				       &if_info->else_simple, cond_rename_regs))
+    {
+      BITMAP_FREE (cond_rename_regs);
+      return false;
+    }
 
+  if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
+    {
+      BITMAP_FREE (cond_rename_regs);
+      return false;
+    }
+  BITMAP_FREE (cond_rename_regs);
+  cond = if_info->cond;
   if (speed_p)
     if_info->original_cost += average_cost (then_cost, else_cost,
					    find_edge (test_bb, then_bb));
@@ -5823,12 +6014,13 @@ if_convert (bool after_combine)
 {
   basic_block bb;
   int pass;
-
   if (optimize == 1)
     {
       df_live_add_problem ();
       df_live_set_all_dirty ();
     }
+  free_dominance_info (CDI_DOMINATORS);
+  cleanup_cfg (CLEANUP_EXPENSIVE);
 
   /* Record whether we are after combine pass. */
   ifcvt_after_combine = after_combine;
@@ -5933,7 +6125,6 @@ rest_of_handle_if_conversion (void)
       dump_reg_info (dump_file);
       dump_flow_info (dump_file, dump_flags);
     }
-  cleanup_cfg (CLEANUP_EXPENSIVE);
   if_convert (false);
   if (num_updated_if_blocks)
     /* Get rid of any dead CC-related instructions. */
diff --git a/gcc/params.opt b/gcc/params.opt
index d2196dc68..ba87f820b 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -669,6 +669,10 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
 Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
 Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
 
+-param=ifcvt-allow-register-renaming=
+Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
+Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
+
 -param=max-sched-extend-regions-iters=
 Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
 The maximum number of iterations through CFG to extend regions.
diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
new file mode 100644
index 000000000..65c4d4140
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
@@ -0,0 +1,35 @@
+
+extern void abort(void);
+
+__attribute__ ((noinline))
+int foo (int x, int y, int z, int a, int b)
+{
+  if (a < 2) {
+    if (a == 0) {
+      if (x - y < 0)
+	x = x - y + z;
+      else
+	x = x - y;
+    }
+    else {
+      if (x + y >= z)
+	x = x + y - z;
+      else
+	x = x + y;
+    }
+  }
+  return x;
+}
+
+int main(void) {
+  if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2
+    abort ();
+  if (foo (50,10,7,0,1) != 40) // x - y = 40
+    abort ();
+  if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8
+    abort ();
+  if (foo (5,10,70,1,1) != 15) // x + y = 15
+    abort ();
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c
new file mode 100644
index 000000000..be9a67b3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-6.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 -fifcvt-allow-complicated-cmps" } */
+
+typedef unsigned int uint16_t;
+
+uint16_t
+foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a,
+     uint16_t b, uint16_t c, uint16_t d) {
+  int i = 1;
+  int j = 1;
+  if (a > b) {
+    j = x;
+    if (b > c)
+      i = y;
+    else
+      i = z;
+  }
+  else {
+    j = y;
+    if (c > d)
+      i = z;
+  }
+  return i * j;
+}
+
+/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */
+
--
2.33.0

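To see the shape of source code patch 0036 targets, consider a branch guarded by a comparison of a register against a sum, the cmp reg1 (reg2 + reg3) case from the commit message. The sketch below is our own illustration, not part of the synced patch; only the flag and parameter names come from the patch itself.

/* Illustrative only: a branch whose condition is a "reg vs. reg+reg"
   comparison, the kind -fifcvt-allow-complicated-cmps lets the RTL
   if-conversion pass consider for a conditional-move sequence.  */
int pick (int a, int b, int c)
{
  int t = a;
  if (a == b + c)   /* complicated cmp: register against a sum */
    t = b - c;
  return t;
}

/* A plausible invocation exercising both new knobs (assumed usage):
   gcc -O2 -fifcvt-allow-complicated-cmps \
       --param=ifcvt-allow-register-renaming=2 pick.c  */
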
0037-Perform-early-if-conversion-of-simple-arithmetic.patch (new file, 109 lines)
@@ -0,0 +1,109 @@
From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 12 Jan 2023 14:52:49 +0300
Subject: [PATCH 03/18] Perform early if-conversion of simple arithmetic

---
 gcc/common.opt                      |  4 ++++
 gcc/match.pd                        | 25 +++++++++++++++++++
 gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c

diff --git a/gcc/common.opt b/gcc/common.opt
index aa00fb7b0..dac477c04 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1821,6 +1821,10 @@ fif-conversion2
 Common Var(flag_if_conversion2) Optimization
 Perform conversion of conditional jumps to conditional execution.
 
+fif-conversion-gimple
+Common Var(flag_if_conversion_gimple) Optimization
+Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
 fstack-reuse=
 Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
 -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 6f24d5079..3cbaf2a5b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   )
  )
 )
+
+(if (flag_if_conversion_gimple)
+ (for simple_op (plus minus bit_and bit_ior bit_xor)
+  (simplify
+   (cond @0 (simple_op @1 INTEGER_CST@2) @1)
+   (switch
+    /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
+    (if (integer_onep (@2))
+     (simple_op @1 (convert (convert:boolean_type_node @0))))
+    /* a = cond ? a + powerof2cst : a ->
+       a = a + ((int) cond) << log2 (powerof2cst) */
+    (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
+     (with
+      {
+	tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
+      }
+      (simple_op @1 (lshift (convert (convert:boolean_type_node @0))
+		     { shift; })
+      )
+     )
+    )
+   )
+  )
+ )
+)
 #endif
 
 #if GIMPLE
diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
new file mode 100644
index 000000000..0f7c87e5c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */
+
+int test_int (int optimizable_int) {
+  if (optimizable_int > 5)
+    ++optimizable_int;
+  return optimizable_int;
+}
+
+int test_int_pow2 (int optimizable_int_pow2) {
+  if (optimizable_int_pow2 <= 4)
+    optimizable_int_pow2 += 1024;
+  return optimizable_int_pow2;
+}
+
+int test_int_non_pow2 (int not_optimizable_int_non_pow2) {
+  if (not_optimizable_int_non_pow2 == 1)
+    not_optimizable_int_non_pow2 += 513;
+  return not_optimizable_int_non_pow2;
+}
+
+float test_float (float not_optimizable_float) {
+  if (not_optimizable_float > 5)
+    not_optimizable_float += 1;
+  return not_optimizable_float;
+}
+
+/* Expecting if-else block in test_float and test_int_non_pow2 only. */
+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */
+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */
+
+/* Expecting shifted result only for optimizable_int_pow2. */
+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */
--
2.33.0

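The match.pd rule in patch 0037 rests on a small branchless identity: for a power-of-two constant C, "cond ? a + C : a" equals "a + ((int) cond << log2 (C))", and for C == 1 the shift disappears entirely. A self-checking C sketch of the identity follows; it is our own illustration, not part of the synced patch.

/* Illustrative only: the branchless rewrite the new rule performs,
   shown at the source level for C == 1024 (log2 == 10).  */
#include <assert.h>

static int branchy (int a, int cond)
{
  return cond ? a + 1024 : a;
}

static int branchless (int a, int cond)
{
  /* (cond != 0) normalizes the condition to 0 or 1, as the
     convert:boolean_type_node step does at the gimple level.  */
  return a + ((cond != 0) << 10);
}

int main (void)
{
  for (int a = -3; a <= 3; a++)
    {
      assert (branchy (a, 0) == branchless (a, 0));
      assert (branchy (a, 7) == branchless (a, 7));
    }
  return 0;
}
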
0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch (new file, 252 lines)
@@ -0,0 +1,252 @@
From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Tue, 24 Jan 2023 16:43:40 +0300
Subject: [PATCH 04/18] Add option to allow matching uaddsub overflow for widen
 ops too.

---
 gcc/common.opt                 |   5 ++
 gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++
 gcc/tree-ssa-math-opts.cc      |  43 ++++++++--
 3 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c

diff --git a/gcc/common.opt b/gcc/common.opt
index dac477c04..39c90604e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3106,6 +3106,11 @@ freciprocal-math
 Common Var(flag_reciprocal_math) SetByCombined Optimization
 Same as -fassociative-math for expressions which include division.
 
+fuaddsub-overflow-match-all
+Common Var(flag_uaddsub_overflow_match_all)
+Match unsigned add/sub overflow even if the target does not support
+the corresponding instruction.
+
 ; Nonzero means that unsafe floating-point math optimizations are allowed
 ; for the sake of speed. IEEE compliance is not guaranteed, and operations
 ; are allowed to assume that their arguments and results are "normal"
diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c
new file mode 100644
index 000000000..96c26d308
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/uaddsub.c
@@ -0,0 +1,143 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */
+#include <stdint.h>
+
+typedef unsigned __int128 uint128_t;
+typedef struct uint256_t
+{
+  uint128_t lo;
+  uint128_t hi;
+} uint256_t;
+
+uint16_t add16 (uint8_t a, uint8_t b)
+{
+  uint8_t tmp = a + b;
+  uint8_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint16_t res = overflow;
+  res <<= 8;
+  res += tmp;
+  return res;
+}
+
+uint32_t add32 (uint16_t a, uint16_t b)
+{
+  uint16_t tmp = a + b;
+  uint16_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint32_t res = overflow;
+  res <<= 16;
+  res += tmp;
+  return res;
+}
+
+uint64_t add64 (uint32_t a, uint32_t b)
+{
+  uint32_t tmp = a + b;
+  uint32_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint64_t res = overflow;
+  res <<= 32;
+  res += tmp;
+  return res;
+}
+
+uint128_t add128 (uint64_t a, uint64_t b)
+{
+  uint64_t tmp = a + b;
+  uint64_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint128_t res = overflow;
+  res <<= 64;
+  res += tmp;
+  return res;
+}
+
+uint256_t add256 (uint128_t a, uint128_t b)
+{
+  uint128_t tmp = a + b;
+  uint128_t overflow = 0;
+  if (tmp < a)
+    overflow = 1;
+
+  uint256_t res;
+  res.hi = overflow;
+  res.lo = tmp;
+  return res;
+}
+
+uint16_t sub16 (uint8_t a, uint8_t b)
+{
+  uint8_t tmp = a - b;
+  uint8_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint16_t res = overflow;
+  res <<= 8;
+  res += tmp;
+  return res;
+}
+
+uint32_t sub32 (uint16_t a, uint16_t b)
+{
+  uint16_t tmp = a - b;
+  uint16_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint32_t res = overflow;
+  res <<= 16;
+  res += tmp;
+  return res;
+}
+
+uint64_t sub64 (uint32_t a, uint32_t b)
+{
+  uint32_t tmp = a - b;
+  uint32_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint64_t res = overflow;
+  res <<= 32;
+  res += tmp;
+  return res;
+}
+
+uint128_t sub128 (uint64_t a, uint64_t b)
+{
+  uint64_t tmp = a - b;
+  uint64_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint128_t res = overflow;
+  res <<= 64;
+  res += tmp;
+  return res;
+}
+
+uint256_t sub256 (uint128_t a, uint128_t b)
+{
+  uint128_t tmp = a - b;
+  uint128_t overflow = 0;
+  if (tmp > a)
+    overflow = -1;
+
+  uint256_t res;
+  res.hi = overflow;
+  res.lo = tmp;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 232e903b0..55d6ee8ae 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
     }
 }
 
+/* Check if the corresponding operation has wider equivalent on the target. */
+
+static bool
+wider_optab_check_p (optab op, machine_mode mode, int unsignedp)
+{
+  machine_mode wider_mode;
+  FOR_EACH_WIDER_MODE (wider_mode, mode)
+    {
+      machine_mode next_mode;
+      if (optab_handler (op, wider_mode) != CODE_FOR_nothing
+	  || (op == smul_optab
+	      && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode)
+	      && (find_widening_optab_handler ((unsignedp
						? umul_widen_optab
						: smul_widen_optab),
					       next_mode, mode))))
+	return true;
+    }
+
+  return false;
+}
 
 /* Helper function of match_arith_overflow. For MUL_OVERFLOW, if we have
    a check for non-zero like:
@@ -3903,15 +3924,22 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
	      || code == MINUS_EXPR
	      || code == MULT_EXPR
	      || code == BIT_NOT_EXPR);
+  int unsignedp = TYPE_UNSIGNED (type);
   if (!INTEGRAL_TYPE_P (type)
-      || !TYPE_UNSIGNED (type)
-      || has_zero_uses (lhs)
-      || (code != PLUS_EXPR
-	  && code != MULT_EXPR
-	  && optab_handler (code == MINUS_EXPR ? usubv4_optab : uaddv4_optab,
-			    TYPE_MODE (type)) == CODE_FOR_nothing))
+      || !unsignedp
+      || has_zero_uses (lhs))
     return false;
 
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
+    {
+      machine_mode mode = TYPE_MODE (type);
+      optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab;
+      if (optab_handler (op, mode) == CODE_FOR_nothing
+	  && (!flag_uaddsub_overflow_match_all
+	      || !wider_optab_check_p (op, mode, unsignedp)))
+	return false;
+    }
+
   tree rhs1 = gimple_assign_rhs1 (stmt);
   tree rhs2 = gimple_assign_rhs2 (stmt);
   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
@@ -3986,7 +4014,8 @@ match_arith_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
       || (code != MULT_EXPR && (code == BIT_NOT_EXPR ? use_seen : !use_seen))
       || (code == PLUS_EXPR
	   && optab_handler (uaddv4_optab,
			     TYPE_MODE (type)) == CODE_FOR_nothing
+	   && !flag_uaddsub_overflow_match_all)
       || (code == MULT_EXPR
	   && optab_handler (cast_stmt ? mulv4_optab : umulv4_optab,
			     TYPE_MODE (type)) == CODE_FOR_nothing))
--
2.33.0

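The idiom patch 0038 recognizes is the classic carry computation "tmp = a + b; overflow = tmp < a". At the gimple level the pass rewrites it to the .ADD_OVERFLOW internal function, whose source-level counterpart is __builtin_add_overflow. A self-checking C sketch of the equivalence follows; it is our own illustration, not part of the synced patch.

/* Illustrative only: the recognized overflow idiom next to the builtin
   form the pass effectively produces.  */
#include <assert.h>
#include <stdint.h>

static uint32_t add32_idiom (uint16_t a, uint16_t b)
{
  uint16_t tmp = a + b;
  uint16_t overflow = tmp < a;       /* carry out of the 16-bit add */
  return ((uint32_t) overflow << 16) + tmp;
}

static uint32_t add32_builtin (uint16_t a, uint16_t b)
{
  uint16_t sum;
  uint16_t carry = __builtin_add_overflow (a, b, &sum);
  return ((uint32_t) carry << 16) + sum;
}

int main (void)
{
  assert (add32_idiom (0xFFFF, 1) == add32_builtin (0xFFFF, 1)); /* 0x10000 */
  assert (add32_idiom (12, 34) == add32_builtin (12, 34));
  return 0;
}
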
0039-Match-double-sized-mul-pattern.patch (new file, 488 lines)
@@ -0,0 +1,488 @@
|
||||
From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001
|
||||
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
|
||||
Date: Wed, 25 Jan 2023 15:04:07 +0300
|
||||
Subject: [PATCH 05/18] Match double sized mul pattern
|
||||
|
||||
---
|
||||
gcc/match.pd | 136 +++++++++++++++++++++
|
||||
gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++
|
||||
gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++
|
||||
gcc/tree-ssa-math-opts.cc | 80 ++++++++++++
|
||||
4 files changed, 419 insertions(+)
|
||||
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c
|
||||
create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c
|
||||
|
||||
diff --git a/gcc/match.pd b/gcc/match.pd
|
||||
index 3cbaf2a5b..61866cb90 100644
|
||||
--- a/gcc/match.pd
|
||||
+++ b/gcc/match.pd
|
||||
@@ -7895,3 +7895,139 @@ and,
|
||||
== TYPE_UNSIGNED (TREE_TYPE (@3))))
|
||||
&& single_use (@4)
|
||||
&& single_use (@5))))
|
||||
+
|
||||
+/* Match multiplication with double sized result.
|
||||
+
|
||||
+ Consider the following calculations:
|
||||
+ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo)
|
||||
+ * (2^(bit_size/2) * arg1_hi + arg1_lo)
|
||||
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
|
||||
+ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi)
|
||||
+ + arg0_lo * arg1_lo
|
||||
+
|
||||
+ The products of high and low parts fits in bit_size values, thus they are
|
||||
+ placed in high and low parts of result respectively.
|
||||
+
|
||||
+ The sum of the mixed products may overflow, so we need a detection for that.
|
||||
+ Also it has a bit_size/2 offset, thus it intersects with both high and low
|
||||
+ parts of result. Overflow detection constant is bit_size/2 due to this.
|
||||
+
|
||||
+ With this info:
|
||||
+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi
|
||||
+ + 2^(bit_size/2) * middle
|
||||
+ + 2^bit_size * possible_middle_overflow
|
||||
+ + arg0_lo * arg1_lo
|
||||
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow)
|
||||
+ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo)
|
||||
+ + arg0_lo * arg1_lo
|
||||
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi
|
||||
+ + possible_middle_overflow)
|
||||
+ + 2^(bit_size/2) * middle_lo
|
||||
+ + arg0_lo * arg1_lo
|
||||
+
|
||||
+ The last sum can produce overflow for the high result part. With this:
|
||||
+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow
|
||||
+ + possible_res_lo_overflow + middle_hi)
|
||||
+ + res_lo
|
||||
+ = res_hi + res_lo
|
||||
+
|
||||
+ This formula is quite big to fit into one match pattern with all of the
|
||||
+ combinations of terms inside it. There are many helpers for better code
|
||||
+ readability.
|
||||
+
|
||||
+ The simplification basis is res_hi: assuming that res_lo only is not
|
||||
+ real practical case for such calculations.
|
||||
+
|
||||
+ Overflow handling is done via matching complex calculations:
|
||||
+ the realpart and imagpart are quite handy here. */
|
||||
+/* Match low and high parts of the argument. */
|
||||
+(match (double_size_mul_arg_lo @0 @1)
|
||||
+ (bit_and @0 INTEGER_CST@1)
|
||||
+ (if (wi::to_wide (@1)
|
||||
+ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type)))))
|
||||
+(match (double_size_mul_arg_hi @0 @1)
|
||||
+ (rshift @0 INTEGER_CST@1)
|
||||
+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2)))
|
||||
+
|
||||
+/* Match various argument parts products. */
|
||||
+(match (double_size_mul_lolo @0 @1)
|
||||
+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3))
|
||||
+ (if (single_use (@4))))
|
||||
+(match (double_size_mul_hihi @0 @1)
|
||||
+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3))
|
||||
+ (if (single_use (@4))))
|
||||
+(match (double_size_mul_lohi @0 @1)
|
||||
+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3))
|
||||
+ (if (single_use (@4))))
|
||||
+
|
||||
+/* Match complex middle sum. */
|
||||
+(match (double_size_mul_middle_complex @0 @1)
|
||||
+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0))
|
||||
+ (if (num_imm_uses (@2) == 2)))
|
||||
+
|
||||
+/* Match real middle results. */
|
||||
+(match (double_size_mul_middle @0 @1)
|
||||
+ (realpart@2 (double_size_mul_middle_complex @0 @1))
|
||||
+ (if (num_imm_uses (@2) == 2)))
|
||||
+(match (double_size_mul_middleres_lo @0 @1)
|
||||
+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
|
||||
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
|
||||
+ && single_use (@3))))
|
||||
+(match (double_size_mul_middleres_hi @0 @1)
|
||||
+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2)
|
||||
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
|
||||
+ && single_use (@3))))
|
||||
+
|
||||
+/* Match low result part. */
|
||||
+/* Number of uses may be < 2 in case when we are interested in
|
||||
+ high part only. */
|
||||
+(match (double_size_mul_res_lo_complex @0 @1)
|
||||
+ (IFN_ADD_OVERFLOW:c@2
|
||||
+ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1))
|
||||
+ (if (num_imm_uses (@2) <= 2)))
|
||||
+(match (double_size_mul_res_lo @0 @1)
|
||||
+ (realpart (double_size_mul_res_lo_complex @0 @1)))
|
||||
+
|
||||
+/* Match overflow terms. */
|
||||
+(match (double_size_mul_overflow_check_lo @0 @1 @5)
|
||||
+ (convert@4 (ne@3
|
||||
+ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop))
|
||||
+ (if (single_use (@2) && single_use (@3) && single_use (@4))))
|
||||
+(match (double_size_mul_overflow_check_hi @0 @1)
|
||||
+ (lshift@6 (convert@5 (ne@4
|
||||
+ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop))
|
||||
+ INTEGER_CST@2)
|
||||
+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2
|
||||
+ && single_use (@3) && single_use (@4) && single_use (@5)
|
||||
+ && single_use (@6))))
|
||||
+
|
||||
+/* Match all possible permutations for high result part calculations. */
|
||||
+(for op1 (double_size_mul_hihi
|
||||
+ double_size_mul_overflow_check_hi
|
||||
+ double_size_mul_middleres_hi)
|
||||
+ op2 (double_size_mul_overflow_check_hi
|
||||
+ double_size_mul_middleres_hi
|
||||
+ double_size_mul_hihi)
|
||||
+ op3 (double_size_mul_middleres_hi
|
||||
+ double_size_mul_hihi
|
||||
+ double_size_mul_overflow_check_hi)
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2
|
||||
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1))
|
||||
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))
|
||||
+ (if (single_use (@4) && single_use (@5))))
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3)
|
||||
+ (plus:c@4 (op1:c @0 @1)
|
||||
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
|
||||
+ (if (single_use (@4) && single_use (@5))))
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2 (op1:c @0 @1)
|
||||
+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3)
|
||||
+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))))
|
||||
+ (if (single_use (@4) && single_use (@5))))
|
||||
+ (match (double_size_mul_candidate @0 @1 @2 @3)
|
||||
+ (plus:c@2 (op1:c @0 @1)
|
||||
+ (plus:c@4 (op2:c @0 @1)
|
||||
+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
|
||||
+ (if (single_use (@4) && single_use (@5)))))
|
||||
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
|
||||
new file mode 100644
|
||||
index 000000000..4d475cc8a
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c
|
||||
@@ -0,0 +1,141 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for
|
||||
+ proper overflow detection in some cases. */
|
||||
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+typedef unsigned __int128 uint128_t;
|
||||
+
|
||||
+uint16_t mul16 (uint8_t a, uint8_t b)
|
||||
+{
|
||||
+ uint8_t a_lo = a & 0xF;
|
||||
+ uint8_t b_lo = b & 0xF;
|
||||
+ uint8_t a_hi = a >> 4;
|
||||
+ uint8_t b_hi = b >> 4;
|
||||
+ uint8_t lolo = a_lo * b_lo;
|
||||
+ uint8_t lohi = a_lo * b_hi;
|
||||
+ uint8_t hilo = a_hi * b_lo;
|
||||
+ uint8_t hihi = a_hi * b_hi;
|
||||
+ uint8_t middle = hilo + lohi;
|
||||
+ uint8_t middle_hi = middle >> 4;
|
||||
+ uint8_t middle_lo = middle << 4;
|
||||
+ uint8_t res_lo = lolo + middle_lo;
|
||||
+ uint8_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x10 : 0);
|
||||
+ uint16_t res = ((uint16_t) res_hi) << 8;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint32_t mul32 (uint16_t a, uint16_t b)
|
||||
+{
|
||||
+ uint16_t a_lo = a & 0xFF;
|
||||
+ uint16_t b_lo = b & 0xFF;
|
||||
+ uint16_t a_hi = a >> 8;
|
||||
+ uint16_t b_hi = b >> 8;
|
||||
+ uint16_t lolo = a_lo * b_lo;
|
||||
+ uint16_t lohi = a_lo * b_hi;
|
||||
+ uint16_t hilo = a_hi * b_lo;
|
||||
+ uint16_t hihi = a_hi * b_hi;
|
||||
+ uint16_t middle = hilo + lohi;
|
||||
+ uint16_t middle_hi = middle >> 8;
|
||||
+ uint16_t middle_lo = middle << 8;
|
||||
+ uint16_t res_lo = lolo + middle_lo;
|
||||
+ uint16_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x100 : 0);
|
||||
+ uint32_t res = ((uint32_t) res_hi) << 16;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint64_t mul64 (uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ uint32_t a_lo = a & 0xFFFF;
|
||||
+ uint32_t b_lo = b & 0xFFFF;
|
||||
+ uint32_t a_hi = a >> 16;
|
||||
+ uint32_t b_hi = b >> 16;
|
||||
+ uint32_t lolo = a_lo * b_lo;
|
||||
+ uint32_t lohi = a_lo * b_hi;
|
||||
+ uint32_t hilo = a_hi * b_lo;
|
||||
+ uint32_t hihi = a_hi * b_hi;
|
||||
+ uint32_t middle = hilo + lohi;
|
||||
+ uint32_t middle_hi = middle >> 16;
|
||||
+ uint32_t middle_lo = middle << 16;
|
||||
+ uint32_t res_lo = lolo + middle_lo;
|
||||
+ uint32_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x10000 : 0);
|
||||
+ uint64_t res = ((uint64_t) res_hi) << 32;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint128_t mul128 (uint64_t a, uint64_t b)
|
||||
+{
|
||||
+ uint64_t a_lo = a & 0xFFFFFFFF;
|
||||
+ uint64_t b_lo = b & 0xFFFFFFFF;
|
||||
+ uint64_t a_hi = a >> 32;
|
||||
+ uint64_t b_hi = b >> 32;
|
||||
+ uint64_t lolo = a_lo * b_lo;
|
||||
+ uint64_t lohi = a_lo * b_hi;
|
||||
+ uint64_t hilo = a_hi * b_lo;
|
||||
+ uint64_t hihi = a_hi * b_hi;
|
||||
+ uint64_t middle = hilo + lohi;
|
||||
+ uint64_t middle_hi = middle >> 32;
|
||||
+ uint64_t middle_lo = middle << 32;
|
||||
+ uint64_t res_lo = lolo + middle_lo;
|
||||
+ uint64_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x100000000 : 0);
|
||||
+ uint128_t res = ((uint128_t) res_hi) << 64;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint64_t mul64_perm (uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ uint32_t a_lo = a & 0xFFFF;
|
||||
+ uint32_t b_lo = b & 0xFFFF;
|
||||
+ uint32_t a_hi = a >> 16;
|
||||
+ uint32_t b_hi = b >> 16;
|
||||
+ uint32_t lolo = a_lo * b_lo;
|
||||
+ uint32_t lohi = a_lo * b_hi;
|
||||
+ uint32_t hilo = a_hi * b_lo;
|
||||
+ uint32_t hihi = a_hi * b_hi;
|
||||
+ uint32_t middle = hilo + lohi;
|
||||
+ uint32_t middle_hi = middle >> 16;
|
||||
+ uint32_t middle_lo = middle << 16;
|
||||
+ uint32_t res_lo = lolo + middle_lo;
|
||||
+ uint32_t res_hi = hihi + middle_hi;
|
||||
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
|
||||
+ res_hi = middle < hilo ? res_hi + 0x10000 : res_hi;
|
||||
+ uint64_t res = ((uint64_t) res_hi) << 32;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+uint128_t mul128_perm (uint64_t a, uint64_t b)
|
||||
+{
|
||||
+ uint64_t a_lo = a & 0xFFFFFFFF;
|
||||
+ uint64_t b_lo = b & 0xFFFFFFFF;
|
||||
+ uint64_t a_hi = a >> 32;
|
||||
+ uint64_t b_hi = b >> 32;
|
||||
+ uint64_t lolo = a_lo * b_lo;
|
||||
+ uint64_t lohi = a_lo * b_hi;
|
||||
+ uint64_t hilo = a_hi * b_lo;
|
||||
+ uint64_t hihi = a_hi * b_hi;
|
||||
+ uint64_t middle = hilo + lohi;
|
||||
+ uint64_t middle_hi = middle >> 32;
|
||||
+ uint64_t middle_lo = middle << 32;
|
||||
+ uint64_t res_lo = lolo + middle_lo;
|
||||
+ uint64_t res_hi = hihi + middle_hi;
|
||||
+ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi;
|
||||
+ res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi;
|
||||
+ uint128_t res = ((uint128_t) res_hi) << 64;
|
||||
+ res += res_lo;
|
||||
+ return res;
|
||||
+}
|
||||
+
|
||||
+/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */
|
||||
diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
|
||||
new file mode 100644
|
||||
index 000000000..cc6e5af25
|
||||
--- /dev/null
|
||||
+++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c
|
||||
@@ -0,0 +1,62 @@
|
||||
+/* { dg-do compile } */
|
||||
+/* fif-conversion-gimple is required for proper overflow detection
|
||||
+ in some cases. */
|
||||
+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+typedef unsigned __int128 uint128_t;
|
||||
+typedef struct uint256_t
|
||||
+{
|
||||
+ uint128_t lo;
|
||||
+ uint128_t hi;
|
||||
+} uint256_t;
|
||||
+
|
||||
+uint64_t mul64_double_use (uint32_t a, uint32_t b)
|
||||
+{
|
||||
+ uint32_t a_lo = a & 0xFFFF;
|
||||
+ uint32_t b_lo = b & 0xFFFF;
|
||||
+ uint32_t a_hi = a >> 16;
|
||||
+ uint32_t b_hi = b >> 16;
|
||||
+ uint32_t lolo = a_lo * b_lo;
|
||||
+ uint32_t lohi = a_lo * b_hi;
|
||||
+ uint32_t hilo = a_hi * b_lo;
|
||||
+ uint32_t hihi = a_hi * b_hi;
|
||||
+ uint32_t middle = hilo + lohi;
|
||||
+ uint32_t middle_hi = middle >> 16;
|
||||
+ uint32_t middle_lo = middle << 16;
|
||||
+ uint32_t res_lo = lolo + middle_lo;
|
||||
+ uint32_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+ res_hi += (middle < hilo ? 0x10000 : 0);
|
||||
+ uint64_t res = ((uint64_t) res_hi) << 32;
|
||||
+ res += res_lo;
|
||||
+ return res + lolo;
|
||||
+}
|
||||
+
|
||||
+uint256_t mul256 (uint128_t a, uint128_t b)
|
||||
+{
|
||||
+ uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF;
|
||||
+ uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF;
|
||||
+ uint128_t a_hi = a >> 64;
|
||||
+ uint128_t b_hi = b >> 64;
|
||||
+ uint128_t lolo = a_lo * b_lo;
|
||||
+ uint128_t lohi = a_lo * b_hi;
|
||||
+ uint128_t hilo = a_hi * b_lo;
|
||||
+ uint128_t hihi = a_hi * b_hi;
|
||||
+ uint128_t middle = hilo + lohi;
|
||||
+ uint128_t middle_hi = middle >> 64;
|
||||
+ uint128_t middle_lo = middle << 64;
|
||||
+ uint128_t res_lo = lolo + middle_lo;
|
||||
+ uint128_t res_hi = hihi + middle_hi;
|
||||
+ res_hi += (res_lo < middle_lo ? 1 : 0);
|
||||
+  /* Workaround for the "constant is too big" warning.  */
+  uint128_t overflow_tmp = (middle < hilo ? 1 : 0);
+  overflow_tmp <<= 64;
+  res_hi += overflow_tmp;
+  uint256_t res;
+  res.lo = res_lo;
+  res.hi = res_hi;
+  return res;
+}
+
+/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */
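Note on the test above: both functions are deliberate negative cases. mul64_double_use keeps an extra use of the partial product lolo (the final "+ lolo"), and mul256 would need a 256-bit multiply, which presumably has no optab on the target, so the matcher must not fire and the dump is checked for the absence of the statistic. For contrast, here is a sketch of the accepted shape (illustrative C, not part of the patch; the name mul64_widened is mine): when the partial products feed only the recombination, the whole body is equivalent to one widening multiply.

#include <stdint.h>

/* What a clean 32x32->64 open-coded multiply collapses to once the
   pass replaces the four 16x16 partial products.  */
uint64_t mul64_widened (uint32_t a, uint32_t b)
{
  return (uint64_t) a * (uint64_t) b;
}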
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 55d6ee8ae..2c06b8a60 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -210,6 +210,9 @@ static struct

   /* Number of highpart multiplication ops inserted.  */
   int highpart_mults_inserted;
+
+  /* Number of optimized double sized multiplications.  */
+  int double_sized_mul_optimized;
 } widen_mul_stats;

 /* The instance of "struct occurrence" representing the highest
@@ -4893,6 +4896,78 @@ optimize_spaceship (gimple *stmt)
 }


+/* Pattern matcher for double sized multiplication defined in match.pd.  */
+extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree));
+
+static bool
+convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt)
+{
+  gimple *use_stmt, *complex_res_lo;
+  gimple_stmt_iterator insert_before;
+  imm_use_iterator use_iter;
+  tree match[4]; // arg0, arg1, res_hi, complex_res_lo
+  tree arg0, arg1, widen_mult, new_type, tmp;
+  tree lhs = gimple_assign_lhs (stmt);
+  location_t loc = UNKNOWN_LOCATION;
+  machine_mode mode;
+
+  if (!gimple_double_size_mul_candidate (lhs, match, NULL))
+    return false;
+
+  new_type = build_nonstandard_integer_type (
+    TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1);
+  mode = TYPE_MODE (new_type);
+
+  /* Early return if the required multiplication is not supported
+     on the target.  */
+  if (optab_handler (smul_optab, mode) == CODE_FOR_nothing
+      && !wider_optab_check_p (smul_optab, mode, 1))
+    return false;
+
+  /* Determine the point where the wide multiplication
+     should be inserted.  Complex low res is OK since it is required
+     by both high and low part getters, thus it dominates both of them.  */
+  complex_res_lo = SSA_NAME_DEF_STMT (match[3]);
+  insert_before = gsi_for_stmt (complex_res_lo);
+  gsi_next (&insert_before);
+
+  /* Create the widening multiplication.  */
+  arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]);
+  arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]);
+  widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult",
+                                       MULT_EXPR, arg0, arg1);
+
+  /* Find the mult low part getter.  */
+  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3])
+    if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR)
+      break;
+
+  /* Create high and low (if needed) parts extractors.  */
+  /* Low part.  */
+  if (use_stmt)
+    {
+      loc = gimple_location (use_stmt);
+      tmp = build_and_insert_cast (&insert_before, loc,
+                                   TREE_TYPE (gimple_get_lhs (use_stmt)),
+                                   widen_mult);
+      gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt),
+                                               NOP_EXPR, tmp);
+      gsi_replace (&insert_before, new_stmt, true);
+    }
+
+  /* High part.  */
+  loc = gimple_location (stmt);
+  tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi",
+                                RSHIFT_EXPR, widen_mult,
+                                build_int_cst (new_type,
+                                               TYPE_PRECISION (new_type) / 2));
+  tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp);
+  gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp);
+  gsi_replace (gsi, new_stmt, true);
+
+  widen_mul_stats.double_sized_mul_optimized++;
+  return true;
+}
+
 /* Find integer multiplications where the operands are extended from
    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
    or MULT_HIGHPART_EXPR where appropriate.  */
@@ -4987,6 +5062,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
 	  break;

 	case PLUS_EXPR:
+	  if (convert_double_size_mul (&gsi, stmt))
+	    break;
+	  __attribute__ ((fallthrough));
 	case MINUS_EXPR:
 	  if (!convert_plusminus_to_widen (&gsi, stmt, code))
 	    match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
@@ -5091,6 +5169,8 @@ pass_optimize_widening_mul::execute (function *fun)
 			    widen_mul_stats.divmod_calls_inserted);
   statistics_counter_event (fun, "highpart multiplications inserted",
 			    widen_mul_stats.highpart_mults_inserted);
+  statistics_counter_event (fun, "double sized mul optimized",
+			    widen_mul_stats.double_sized_mul_optimized);

   return cfg_changed ? TODO_cleanup_cfg : 0;
 }
--
2.33.0

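To make the inserted GIMPLE concrete, here is an illustrative C equivalent (mine; the helper name double_size_mul_expanded is hypothetical) of the three statements convert_double_size_mul builds: a widening multiply in the doubled type, a cast for the low half, and a shift plus cast for the high half.

#include <stdint.h>

void double_size_mul_expanded (uint32_t a, uint32_t b,
                               uint32_t *res_lo, uint32_t *res_hi)
{
  uint64_t widen_mult = (uint64_t) a * (uint64_t) b;  /* MULT_EXPR in new_type */
  *res_lo = (uint32_t) widen_mult;                    /* low-part NOP_EXPR cast */
  *res_hi = (uint32_t) (widen_mult >> 32);            /* RSHIFT_EXPR by precision/2 */
}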
2387
0040-Port-icp-patch-to-GCC-12.patch
Normal file
File diff suppressed because it is too large
100
0041-Port-fixes-in-icp-to-GCC-12.patch
Normal file
@@ -0,0 +1,100 @@
From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Tue, 27 Feb 2024 07:43:57 +0800
Subject: [PATCH 07/18] Port fixes in icp to GCC 12

---
 gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc
index 383839189..318535d06 100644
--- a/gcc/ipa-devirt.cc
+++ b/gcc/ipa-devirt.cc
@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map)
   if (!map->count (ftype_uid))
     return;
   type_set* s = (*map)[ftype_uid];
+  if (!s)
+    {
+      fprintf (dump_file, "%d (no set)", ftype_uid);
+      return;
+    }
   for (type_set::const_iterator it = s->begin (); it != s->end (); it++)
     fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it);
 }
@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2)
       if (register_ailas_type (type1, type2, ta_map))
	analyze_pointees (type1, type2);
     }
+  unsigned type1_uid = TYPE_UID (type1);
+  unsigned type2_uid = TYPE_UID (type2);
+  if (type_uid_map->count (type1_uid) == 0)
+    (*type_uid_map)[type1_uid] = type1;
+  if (type_uid_map->count (type2_uid) == 0)
+    (*type_uid_map)[type2_uid] = type2;
+
   /* If function and non-function type pointers alias,
      the function type is unsafe.  */
   if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2))
-    unsafe_types->insert (TYPE_UID (type1));
+    unsafe_types->insert (type1_uid);
   if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1))
-    unsafe_types->insert (TYPE_UID (type2));
+    unsafe_types->insert (type2_uid);

   /* Try to figure out with pointers to incomplete types.  */
   if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2))
@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1)
 static void
 analyze_global_var (varpool_node *var)
 {
-  var->get_constructor();
   tree decl = var->decl;
-  if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl)
-      || integer_zerop (DECL_INITIAL (decl)))
+  if (decl || !DECL_INITIAL (decl))
+    return;
+  var->get_constructor ();
+  if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl))
+      || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK)
     return;

   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt)
     {
       rhs = TREE_OPERAND (rhs, 0);
       if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST
-	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL)
+	  || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL
+	  || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL
+	  || TREE_CODE (rhs) == RESULT_DECL)
	rhs_type = build_pointer_type (TREE_TYPE (rhs));
       else if (TREE_CODE (rhs) == COMPONENT_REF)
	{
@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt)
	  gcc_assert (POINTER_TYPE_P (rhs_type));
	}
       else
-	gcc_unreachable();
+	{
+	  fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ",
+		   get_tree_code_name (TREE_CODE (rhs)));
+	  print_gimple_stmt (dump_file, stmt, 0);
+	  gcc_unreachable ();
+	}
     }
   else
     rhs_type = TREE_TYPE (rhs);
@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases ()
       decl_set *d_set = it1->second;
       tree type = (*type_uid_map)[it1->first];
       type_set *set = (*fta_map)[it1->first];
+      if (!set)
+	continue;
       for (type_set::const_iterator it2 = set->begin ();
	   it2 != set->end (); it2++)
	{
--
2.33.0

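The common thread in these hunks is defensive handling of lookups that can yield null sets. A reduced C sketch (mine, not GCC code) of the guard added to print_type_set and merge_fs_map_for_ftype_aliases:

#include <stdio.h>

struct type_set { int n; int *ids; };

/* A key can be present in the map while its set pointer is still NULL,
   so the pointer must be checked before iterating.  */
static void print_ids (struct type_set *s, unsigned uid)
{
  if (!s)
    {
      printf ("%u (no set)\n", uid);
      return;
    }
  for (int i = 0; i < s->n; i++)
    printf (i == 0 ? "%d" : ", %d", s->ids[i]);
}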
1245
0042-Add-split-complex-instructions-pass.patch
Normal file
File diff suppressed because it is too large
1426
0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
Normal file
File diff suppressed because it is too large
378
0044-Port-maxmin-patch-to-GCC-12.patch
Normal file
@@ -0,0 +1,378 @@
From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:07:24 +0800
Subject: [PATCH 12/18] Port maxmin patch to GCC 12

---
 gcc/config/aarch64/aarch64-simd.md    | 256 ++++++++++++++++++++++++++
 gcc/config/aarch64/predicates.md      |  19 ++
 gcc/testsuite/gcc.dg/combine-maxmin.c |  46 +++++
 3 files changed, 321 insertions(+)
 create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 82f73805f..de92802f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1138,6 +1138,82 @@
   [(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
 )

+;; Simplify the extension with following truncation for shift+neg operation.
+
+(define_insn_and_split "*aarch64_sshr_neg_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (match_operand:V8HI 1 "register_operand")
+		    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (match_dup 1)
+		    (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
+	      (match_dup 2)))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(ashiftrt:V8HI
+	  (neg:V8HI
+	    (match_operand:V8HI 1 "register_operand" "w"))
+	  (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
+  {
+    /* Reduce the shift amount to smaller mode.  */
+    int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
+	      - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
+    operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+  }
+  [(set_attr "type" "multiple")]
+)
+
+;; The helper definition that allows combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(match_operand:V4SI 1 "register_operand" "w"))
+	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+	  (truncate:V4HI
+	    (ashiftrt:V4SI
+	      (neg:V4SI
+		(match_operand:V4SI 3 "register_operand" "w"))
+	      (match_dup 2)))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(set (match_operand:V4SI 1 "register_operand" "=w")
+	(ashiftrt:V4SI
+	  (neg:V4SI
+	    (match_dup 1))
+	  (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+   (set (match_operand:V4SI 3 "register_operand" "=w")
+	(ashiftrt:V4SI
+	  (neg:V4SI
+	    (match_dup 3))
+	  (match_dup 2)))
+   (set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (match_dup 1))
+	  (truncate:V4HI
+	    (match_dup 3))))]
+  ""
+  [(set_attr "type" "multiple")]
+)
+
 (define_insn "*aarch64_simd_sra<mode>"
   [(set (match_operand:VDQ_I 0 "register_operand" "=w")
	(plus:VDQ_I
@@ -1714,6 +1790,26 @@
 }
 )

+(define_insn "vec_pack_trunc_shifted_<mode>"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
+	(vec_concat:<VNARROWQ2>
+	  (truncate:<VNARROWQ>
+	    (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
+			  (match_operand:VQN 2 "half_size_operand" "w")))
+	  (truncate:<VNARROWQ>
+	    (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
+			  (match_operand:VQN 4 "half_size_operand" "w")))))]
+  "TARGET_SIMD"
+  {
+    if (BYTES_BIG_ENDIAN)
+      return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
+    else
+      return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
+  }
+  [(set_attr "type" "neon_permute<q>")
+   (set_attr "length" "4")]
+)
+
 (define_insn "aarch64_shrn<mode>_insn_le"
   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
	(vec_concat:<VNARROWQ2>
@@ -6652,6 +6748,166 @@
   [(set_attr "type" "neon_tst<q>")]
 )

+;; Simplify the extension with following truncation for cmtst-like operation.
+
+(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (plus:V4HI
+	    (truncate:V4HI
+	      (eq:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (and:V8HI
+		      (match_operand:V8HI 1 "register_operand")
+		      (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+		    (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+		(match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
+	    (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
+	  (plus:V4HI
+	    (truncate:V4HI
+	      (eq:V4SI
+		(sign_extend:V4SI
+		  (vec_select:V4HI
+		    (and:V8HI
+		      (match_dup 1)
+		      (match_dup 2))
+		    (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
+		(match_dup 4)))
+	    (match_dup 5))))]
+  "TARGET_SIMD && !reload_completed"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 6 "register_operand" "=w")
+	(match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+   (set (match_operand:V8HI 0 "register_operand" "=w")
+	(plus:V8HI
+	  (eq:V8HI
+	    (and:V8HI
+	      (match_operand:V8HI 1 "register_operand" "w")
+	      (match_dup 6))
+	    (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
+	  (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
+  {
+    if (can_create_pseudo_p ())
+      {
+	int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
+	operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+	int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
+	operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);

+	operands[6] = gen_reg_rtx (V8HImode);
+      }
+    else
+      FAIL;
+  }
+  [(set_attr "type" "neon_tst_q")]
+)
+
+;; Three helper definitions that allow combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(neg:V4SI
+	  (eq:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(and:V8HI
+		  (match_operand:V8HI 1 "register_operand")
+		  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+		(match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  "TARGET_SIMD && !reload_completed"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 5 "register_operand" "=w")
+	(and:V8HI
+	  (match_operand:V8HI 1 "register_operand")
+	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+   (set (match_operand:V4SI 0 "register_operand" "=w")
+	(sign_extend:V4SI
+	  (vec_select:V4HI
+	    (match_dup 5)
+	    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+   (set (match_dup 0)
+	(neg:V4SI
+	  (eq:V4SI
+	    (match_dup 0)
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  {
+    if (can_create_pseudo_p ())
+      operands[5] = gen_reg_rtx (V8HImode);
+    else
+      FAIL;
+  }
+  [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(neg:V4SI
+	  (eq:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(and:V8HI
+		  (match_operand:V8HI 1 "register_operand")
+		  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+		(match_operand:V8HI 3 "vect_par_cnst_hi_half")))
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  "TARGET_SIMD && !reload_completed"
+  "#"
+  "&& true"
+  [(set (match_operand:V8HI 5 "register_operand" "=w")
+	(and:V8HI
+	  (match_operand:V8HI 1 "register_operand")
+	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+   (set (match_operand:V4SI 0 "register_operand" "=w")
+	(sign_extend:V4SI
+	  (vec_select:V4HI
+	    (match_dup 5)
+	    (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
+   (set (match_dup 0)
+	(neg:V4SI
+	  (eq:V4SI
+	    (match_dup 0)
+	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+  {
+    if (can_create_pseudo_p ())
+      operands[5] = gen_reg_rtx (V8HImode);
+    else
+      FAIL;
+  }
+  [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (not:V4SI
+	      (match_operand:V4SI 1 "register_operand" "w")))
+	  (truncate:V4HI
+	    (not:V4SI
+	      (match_operand:V4SI 2 "register_operand" "w")))))]
+  "TARGET_SIMD"
+  "#"
+  "&& true"
+  [(set (match_operand:V4SI 1 "register_operand" "=w")
+	(not:V4SI
+	  (match_dup 1)))
+   (set (match_operand:V4SI 2 "register_operand" "=w")
+	(not:V4SI
+	  (match_dup 2)))
+   (set (match_operand:V8HI 0 "register_operand" "=w")
+	(vec_concat:V8HI
+	  (truncate:V4HI
+	    (match_dup 1))
+	  (truncate:V4HI
+	    (match_dup 2))))]
+  ""
+  [(set_attr "type" "multiple")]
+)
+
 (define_insn_and_split "aarch64_cmtstdi"
   [(set (match_operand:DI 0 "register_operand" "=w,r")
	(neg:DI
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 07c14aacb..1b8496c07 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -118,6 +118,25 @@
	(match_test "aarch64_simd_valid_immediate (op, NULL,
						    AARCH64_CHECK_ORR)"))))

+(define_predicate "aarch64_bic_imm_for_maxmin"
+  (match_code "const_vector")
+{
+  if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
+    return false;
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
+  return CONST_INT_P (op)
+	 && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
+})
+
+(define_predicate "maxmin_arith_shift_operand"
+  (match_code "const_vector")
+{
+  op = unwrap_const_vec_duplicate (op);
+  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
+  return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
 (define_predicate "aarch64_reg_or_bic_imm"
   (ior (match_operand 0 "register_operand")
	(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
new file mode 100755
index 000000000..06bce7029
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -0,0 +1,46 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+
+/* The test checks usage of smax/smin insns for clip evaluation and
+ * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
+ * sources of x264 codec.  */
+
+typedef unsigned char uint8_t;
+typedef long int intptr_t;
+typedef signed short int int16_t;
+
+static __attribute__((always_inline)) inline uint8_t clip (int x )
+{
+  return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
+}
+
+void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+         intptr_t stride, int width, int height, int16_t *buf)
+{
+  const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+  for( int y = 0; y < height; y++ ) {
+    for( int x = -2; x < width+3; x++ ) {
+      int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+              + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+      dstv[x] = clip ( (v + 16) >> 5 );
+      buf[x+2] = v + pad;
+    }
+    for( int x = 0; x < width; x++ )
+      dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+                + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+                - 32*pad + 512) >> 10);
+    for( int x = 0; x < width; x++ )
+      dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+                + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+                + 16) >> 5);
+    dsth += stride;
+    dstv += stride;
+    dstc += stride;
+    src += stride;
+  }
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
--
2.33.0

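For readers unfamiliar with the x264 idiom: clip above is branch-free saturation to [0, 255], and the reason smax/smin can implement it is the identity sketched below (my C, not from the patch).

#include <stdint.h>

/* clip(x) == min(max(x, 0), 255) for the 8-bit case: x < 0 gives
   (-x)>>31 == 0; x > 255 gives (-x)>>31 == -1, masked to 255.  */
static inline uint8_t clip_minmax (int x)
{
  int t = x > 0 ? x : 0;     /* smax with zero */
  return t < 255 ? t : 255;  /* smin with the mask */
}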
239
0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
Normal file
@@ -0,0 +1,239 @@
From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001
From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
Date: Thu, 22 Feb 2024 17:13:27 +0800
Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12

---
 gcc/common.opt                          |   4 +
 gcc/match.pd                            | 104 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/combine-maxmin-1.c |  15 ++++
 gcc/testsuite/gcc.dg/combine-maxmin-2.c |  14 ++++
 gcc/testsuite/gcc.dg/combine-maxmin.c   |  19 +++--
 5 files changed, 151 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
 create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 6c6fabb31..3a5004271 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1846,6 +1846,10 @@ fif-conversion-gimple
 Common Var(flag_if_conversion_gimple) Optimization
 Perform conversion of conditional jumps to branchless equivalents during gimple transformations.

+fconvert-minmax
+Common Var(flag_convert_minmax) Optimization
+Convert saturating clipping to min max.
+
 fstack-reuse=
 Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
 -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
diff --git a/gcc/match.pd b/gcc/match.pd
index 61866cb90..3a19e93b3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8031,3 +8031,107 @@ and,
 (plus:c@4 (op2:c @0 @1)
	   (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
 (if (single_use (@4) && single_use (@5)))))
+
+/* MinMax pattern matching helpers.  More info on the transformation below.  */
+
+/* Match (a & 0b11..100..0) pattern.  */
+(match (minmax_cmp_arg @0 @1)
+ (bit_and @0 INTEGER_CST@1)
+ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+
+/* Match (inversed_sign_bit >> sign_bit_pos) pattern.
+   This statement is blocking for the transformation of unsigned integers.
+   Do type check here to avoid unnecessary duplications.  */
+(match (minmax_sat_arg @0)
+ (rshift (negate @0) INTEGER_CST@1)
+ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
+      && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
+
+/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
+   The matched pattern can be described as saturated clipping.
+
+   The pattern supports truncation via both casts and bit_and.
+   Also there are patterns for possible inverted conditions.  */
+(if (flag_convert_minmax)
+/* Truncation via casts.  Unfortunately convert? cannot be applied here
+   because convert and cond take different number of arguments.  */
+ (simplify
+  (convert
+   (cond
+    (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? (minmax_sat_arg @0))
+    (convert? @0)))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? (minmax_sat_arg @0))
+   (convert? @0))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+
+ (simplify
+  (convert
+   (cond
+    (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? @0)
+    (convert? (minmax_sat_arg @0))))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? @0)
+   (convert? (minmax_sat_arg @0)))
+  (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+
+ /* Truncation via bit_and with mask.  Same concerns on convert? here.  */
+ (simplify
+  (convert
+   (cond
+    (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+    (convert? @0)))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
+   (convert? @0))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+
+ (simplify
+  (convert
+   (cond
+    (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+    (convert? @0)
+    (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
+   (convert? @0)
+   (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
+  (if (wi::to_widest (@2) == ~wi::to_widest (@1))
+   (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
+    (convert (min (max @0 { integer_zero_node; })
+		  { mask; }))))))
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
new file mode 100644
index 000000000..859ff7df8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int32_t *restrict a, int32_t *restrict x)
+{
+  for (int i = 0; i < 4; i++)
+    a[i] = ((((-x[i]) >> 31) ^ x[i])
+           & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
+}
+
+/* { dg-final { scan-assembler-not {smax\t} } } */
+/* { dg-final { scan-assembler-not {smin\t} } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
new file mode 100644
index 000000000..63d4d85b3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fconvert-minmax" } */
+
+#include <inttypes.h>
+
+__attribute__((noinline))
+void test (int8_t *restrict a, int32_t *restrict x)
+{
+  for (int i = 0; i < 8; i++)
+    a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index 06bce7029..a984fa560 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target aarch64-*-* } } */
-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+/* { dg-options "-O3 -fconvert-minmax" } */

 /* The test checks usage of smax/smin insns for clip evaluation and
  * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 {
   const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
   for( int y = 0; y < height; y++ ) {
+    /* This loop is not being vectorized now.  */
     for( int x = -2; x < width+3; x++ ) {
       int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
              + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
       dstv[x] = clip ( (v + 16) >> 5 );
       buf[x+2] = v + pad;
     }
+
+    /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN.  */
     for( int x = 0; x < width; x++ )
       dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
                + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
                - 32*pad + 512) >> 10);
+
+    /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN.  */
     for( int x = 0; x < width; x++ )
       dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
                + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
                + 16) >> 5);
+
     dsth += stride;
     dstv += stride;
     dstc += stride;
@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 }
 }

-/* { dg-final { scan-assembler-times {smax\t} 4 } } */
-/* { dg-final { scan-assembler-times {smin\t} 4 } } */
-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
+/* Max is performed on 0 from signed values, match smax exactly.  */
+/* { dg-final { scan-assembler-times {smax\t} 6 } } */
+/* Min is performed on signed val>0 and a mask, min sign doesn't matter.  */
+/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
+/* All of the vectorized patterns are expected to be matched.  */
+/* { dg-final { scan-assembler-not {cmtst\t} } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
--
2.33.0
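A scalar view (my illustration) of what the new match.pd rules do: the ternary clip idiom on the left of the transform comment is rewritten into the min/max form, which the vectorizer and the aarch64 backend can then lower to smin/smax.

#include <stdint.h>

/* Before: the saturated-clipping shape matched by minmax_cmp_arg
   and minmax_sat_arg (mask = 0xFF here).  */
int32_t clip_before (int32_t x)
{
  return (x & ~0xFF) ? ((-x) >> 31) & 0xFF : x;
}

/* After: the replacement the simplify rules emit.  */
int32_t clip_after (int32_t x)
{
  int32_t t = x > 0 ? x : 0;   /* max (x, 0) */
  return t < 0xFF ? t : 0xFF;  /* min (t, mask) */
}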
65
0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
Normal file
@@ -0,0 +1,65 @@
From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:20:17 +0800
Subject: [PATCH 14/18] Add new pattern to pass the maxmin tests

---
 gcc/match.pd                          | 24 ++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/combine-maxmin.c |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3a19e93b3..aee58e47b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8038,6 +8038,10 @@ and,
 (match (minmax_cmp_arg @0 @1)
  (bit_and @0 INTEGER_CST@1)
  (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
+/* Match ((unsigned) a > 0b0..01..1) pattern.  */
+(match (minmax_cmp_arg1 @0 @1)
+ (gt @0 INTEGER_CST@1)
+ (if (wi::popcount (wi::to_widest (@1) + 1) == 1)))

 /* Match (inversed_sign_bit >> sign_bit_pos) pattern.
    This statement is blocking for the transformation of unsigned integers.
@@ -8095,6 +8099,26 @@ and,
    (convert (min (max @0 { integer_zero_node; })
		  { mask; })))))

+ (simplify
+  (convert
+   (cond
+    (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+    (convert? (minmax_sat_arg @0))
+    (convert? @0)))
+  (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+    (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+		  { mask; })))))
+ (simplify
+  (cond
+   (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1)
+   (convert? (minmax_sat_arg @0))
+   (convert? @0))
+  (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
+   (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); }
+    (convert (min (max (convert:integer_type_node @0) { integer_zero_node; })
+		  { mask; })))))
+
 /* Truncation via bit_and with mask.  Same concerns on convert? here.  */
 (simplify
  (convert
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
index a984fa560..5c0c9cc49 100755
--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
 /* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
 /* All of the vectorized patterns are expected to be matched.  */
 /* { dg-final { scan-assembler-not {cmtst\t} } } */
-/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 2 } } */
--
2.33.0
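The new minmax_cmp_arg1 matcher covers the unsigned-compare spelling of the same range test. An equivalent scalar form (my C; the function name is mine):

#include <stdint.h>

/* (uint32_t) x > 255 tests the same condition as (x & ~255) != 0,
   so this variant folds to the identical min/max pair.  */
int32_t clip_unsigned_cmp (int32_t x)
{
  if ((uint32_t) x > 255)
    return ((-x) >> 31) & 255;
  return x;
}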
3968
0047-AES-Implement-AES-pattern-matching.patch
Normal file
File diff suppressed because it is too large
27
0048-crypto-accel-add-optimization-level-requirement-to-t.patch
Normal file
@@ -0,0 +1,27 @@
From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001
From: vchernon <chernonog.vyacheslav@huawei.com>
Date: Wed, 27 Dec 2023 23:31:26 +0800
Subject: [PATCH 16/18] [crypto-accel] add optimization level requirement to
 the gate

Fix issue (src-openEuler/gcc: I8RRDW).
---
 gcc/crypto-accel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc
index f4e810a6b..e7766a585 100644
--- a/gcc/crypto-accel.cc
+++ b/gcc/crypto-accel.cc
@@ -2391,7 +2391,7 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
   {
-    if (flag_crypto_accel_aes <= 0)
+    if (flag_crypto_accel_aes <= 0 || optimize < 1)
       return false;
     return targetm.get_v16qi_mode
	   && targetm.gen_rev32v16qi
--
2.33.0

239
0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
Normal file
@@ -0,0 +1,239 @@
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Tue, 12 Mar 2024 23:30:56 +0800
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
 vectorization

Take the minimum between the iteration count and the segment length;
this helps to speed up loops with a small number of iterations when
only the tail can be vectorized.

---
 gcc/params.opt                                |  5 ++
 .../sve/var_stride_flexible_segment_len_1.c   | 23 +++++++
 gcc/tree-data-ref.cc                          | 67 +++++++++++------
 gcc/tree-data-ref.h                           | 11 ++-
 gcc/tree-vect-data-refs.cc                    | 14 +++-
 5 files changed, 95 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c

diff --git a/gcc/params.opt b/gcc/params.opt
index 6176d4790..7e5c119cf 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
 Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.

+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use a minimum length of different segments.  Currently the minimum between
+iteration number and vectorization length is chosen by this param.
+
 -param=vect-max-version-for-alignment-checks=
 Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
 Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+  for (int i = 0; i < SIZE; ++i)
+    x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+   an overlap check that multiplies by (257-1)*4.  */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero.  */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c35..e6ae9e847 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
    same arguments.  Try to optimize cases in which the second access
    is a write and in which some overlap is valid.  */

-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
			   const dr_with_seg_len_pair_t &alias_pair)
 {
   const dr_with_seg_len& dr_a = alias_pair.first;
   const dr_with_seg_len& dr_b = alias_pair.second;

-  /* Check for cases in which:
-
-     (a) DR_B is always a write;
-     (b) the accesses are well-ordered in both the original and new code
-	 (see the comment above the DR_ALIAS_* flags for details); and
-     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
-  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
-    return false;
-
-  /* Check for equal (but possibly variable) steps.  */
   tree step = DR_STEP (dr_a.dr);
-  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
-    return false;
-
-  /* Make sure that we can operate on sizetype without loss of precision.  */
   tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
-  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
-    return false;

   /* All addresses involved are known to have a common alignment ALIGN.
      We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
			     fold_convert (ssizetype, indicator),
			     ssize_int (0));

-  /* Get lengths in sizetype.  */
-  tree seg_len_a
-    = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
   step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));

   /* Each access has the following pattern:
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
   *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
   if (dump_enabled_p ())
     dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2.  */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+			  const dr_with_seg_len_pair_t &alias_pair)
+{
+  const dr_with_seg_len& dr_a = alias_pair.first;
+  const dr_with_seg_len& dr_b = alias_pair.second;
+
+  /* Check for cases in which:
+
+     (a) DR_B is always a write;
+     (b) the accesses are well-ordered in both the original and new code
+	 (see the comment above the DR_ALIAS_* flags for details); and
+     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
+  if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+    return false;
+
+  /* Check for equal (but possibly variable) steps.  */
+  tree step = DR_STEP (dr_a.dr);
+  if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+    return false;
+
+  /* Make sure that we can operate on sizetype without loss of precision.  */
+  tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+  if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+    return false;
+
+  /* Get lengths in sizetype.  */
+  tree seg_len_a
+    = fold_convert (sizetype,
+		    rewrite_to_non_trapping_overflow (dr_a.seg_len));
+  create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+  if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+    {
+      tree seg_len2_a
+	= fold_convert (sizetype,
+			rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+      tree cond_expr2;
+      create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+      *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+				*cond_expr, cond_expr2);
+    }
   return true;
 }

diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index f643a95b2..9bc5f16ee 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -213,12 +213,19 @@ class dr_with_seg_len
 public:
   dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
		   unsigned int a)
-    : dr (d), seg_len (len), access_size (size), align (a) {}
-
+    : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+    {}
+  dr_with_seg_len (data_reference_p d, tree len, tree len2,
+		   unsigned HOST_WIDE_INT size, unsigned int a)
+    : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+    {}
   data_reference_p dr;
   /* The offset of the last access that needs to be checked minus
      the offset of the first.  */
   tree seg_len;
+  /* The second version of segment length.  Currently this is used to
+     soften checks for a small number of iterations.  */
+  tree seg_len2;
   /* A value that, when added to abs (SEG_LEN), gives the total number of
      bytes in the segment.  */
   poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4e615b80b..04e68f621 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
     {
       poly_uint64 lower_bound;
       tree segment_length_a, segment_length_b;
+      tree segment_length2_a, segment_length2_b;
       unsigned HOST_WIDE_INT access_size_a, access_size_b;
       unsigned int align_a, align_b;

@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
	{
	  segment_length_a = size_zero_node;
	  segment_length_b = size_zero_node;
+	  segment_length2_a = size_zero_node;
+	  segment_length2_b = size_zero_node;
	}
       else
	{
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
	    length_factor = scalar_loop_iters;
	  else
	    length_factor = size_int (vect_factor);
+	  /* In any case we should remember scalar_loop_iters;
+	     this helps to create a flexible aliasing check
+	     for a small number of iterations.  */
	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+	  segment_length2_a
+	    = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+	  segment_length2_b
+	    = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
	}
       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
	}

       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
-			    access_size_a, align_a);
+			    segment_length2_a, access_size_a, align_a);
       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
-			    access_size_b, align_b);
+			    segment_length2_b, access_size_b, align_b);
       /* Canonicalize the order to be the one that's needed for accurate
	  RAW, WAR and WAW flags, in cases where the data references are
	  well-ordered.  The order doesn't really matter otherwise,
--
2.33.0
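A rough picture (my sketch, simplified to raw pointer arithmetic; the real code builds tree cond_exprs) of the OR-ed runtime check this patch creates: the original VF-based segment test is kept, and a second test derived from the scalar iteration count is OR-ed in, so short-trip-count loops can still pass the alias-versioning check.

#include <stddef.h>

static int no_overlap (const char *a, const char *b, size_t seg_len)
{
  return a + seg_len <= b || b + seg_len <= a;
}

/* TRUTH_OR_EXPR of the two generated cond_exprs.  */
static int alias_check (const char *a, const char *b,
                        size_t seg_len_vf, size_t seg_len_iters)
{
  return no_overlap (a, b, seg_len_vf) || no_overlap (a, b, seg_len_iters);
}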
2071
0050-Port-IPA-prefetch-to-GCC-12.patch
Normal file
File diff suppressed because it is too large
2216
0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
Normal file
File diff suppressed because it is too large
94
0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
Normal file
@@ -0,0 +1,94 @@
From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Fri, 29 Mar 2024 17:15:41 +0800
Subject: [PATCH 2/2] Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7)

---
 gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc
index 9537e4835..1ceb5137f 100644
--- a/gcc/ipa-prefetch.cc
+++ b/gcc/ipa-prefetch.cc
@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map;
 typedef std::map<memref_t *, tree> memref_tree_map;

 typedef std::set<gimple *> stmt_set;
+typedef std::set<tree> tree_set;
 typedef std::map<tree, tree> tree_map;

 tree_memref_map *tm_map;
@@ -1124,8 +1125,21 @@ analyse_loops ()
     }
 }

+/* Compare memrefs by IDs; helper for qsort.  */
+
+static int
+memref_id_cmp (const void *p1, const void *p2)
+{
+  const memref_t *mr1 = *(const memref_t **) p1;
+  const memref_t *mr2 = *(const memref_t **) p2;
+
+  if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id)
+    return 1;
+  return -1;
+}
+
 /* Reduce the set filtering out memrefs with the same memory references,
-   return the result vector of memrefs.  */
+   sort and return the result vector of memrefs.  */

 static void
 reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
	  vec.safe_push (mr1);
	}
     }
+  vec.qsort (memref_id_cmp);
   if (dump_file)
     {
       fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ());
@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn)
     }

   /* Create other new vars.  Insert new stmts.  */
+  vec<memref_t *> used_mr_vec = vNULL;
   for (memref_set::const_iterator it = used_mrs.begin ();
        it != used_mrs.end (); it++)
+    used_mr_vec.safe_push (*it);
+  used_mr_vec.qsort (memref_id_cmp);
+
+  for (unsigned int j = 0; j < used_mr_vec.length (); j++)
     {
-      memref_t *mr = *it;
+      memref_t *mr = used_mr_vec[j];
       if (mr == comp_mr)
	continue;
       gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0,
@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn)
	  local = integer_three_node;
	  break;
	}
+  tree_set prefetched_addrs;
   for (unsigned int j = 0; j < vmrs.length (); j++)
     {
       memref_t *mr = vmrs[j];
@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn)
       tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
       if (decl_map->count (addr))
	addr = (*decl_map)[addr];
+      if (prefetched_addrs.count (addr))
+	continue;
       last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
				     3, addr, write_p, local);
       pcalls.safe_push (last_stmt);
       gimple_seq_add_stmt (&stmts, last_stmt);
+      prefetched_addrs.insert (addr);
       if (dump_file)
	{
	  fprintf (dump_file, "Insert %d prefetch stmt:\n", j);
--
2.33.0
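Both hunks serve determinism: iterating a std::set of pointers visits them in address order, which can differ from run to run, so the fix snapshots the set into a vector and sorts by the stable memref ID (it also deduplicates prefetches per address). A reduced C version (mine) of the comparator idea:

#include <stdlib.h>

struct memref { unsigned id; };

/* Order by pass-assigned ID, never by pointer value, so the emitted
   prefetch statements come out in a reproducible order.  */
static int memref_id_cmp (const void *p1, const void *p2)
{
  const struct memref *a = *(const struct memref * const *) p1;
  const struct memref *b = *(const struct memref * const *) p2;
  return a->id > b->id ? 1 : -1;
}

static void sort_memrefs (struct memref **v, size_t n)
{
  qsort (v, n, sizeof *v, memref_id_cmp);
}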
1366
0053-struct-reorg-Add-Semi-Relayout.patch
Normal file
File diff suppressed because it is too large
28
0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch
Normal file
@@ -0,0 +1,28 @@
From 9dc3df938b9ed2c27498c8548087fee1ce930366 Mon Sep 17 00:00:00 2001
From: Zhengchen Hui <zhengchenhui1@huawei.com>
Date: Tue, 2 Apr 2024 11:08:30 +0800
Subject: [PATCH] [Struct Reorg] Bugfix for structure pointer compression

---
 gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
index fa33f2d35..3922873f3 100644
--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc
@@ -7541,9 +7541,11 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void)
       if (!type->has_legal_alloc_num)
	{
	  if (current_layout_opt_level & POINTER_COMPRESSION_UNSAFE)
+	    {
	      if (dump_file)
		fprintf (dump_file, " has unknown alloc size, but"
			 " in unsafe mode, so");
+	    }
	  else
	    {
	      if (dump_file)
--
2.33.0

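The two added braces fix a classic dangling-else: without them the else binds to the nearest if (dump_file) rather than to the POINTER_COMPRESSION_UNSAFE test, so the rejection path could run whenever dumping was disabled. A minimal standalone C illustration (mine, not the GCC source):

#include <stdio.h>

void dangling_else_demo (int unsafe_mode, FILE *dump_file)
{
  /* Buggy shape: this else pairs with if (dump_file).  */
  if (unsafe_mode)
    if (dump_file)
      fprintf (dump_file, "unknown alloc size, continuing\n");
    else
      fprintf (stderr, "rejected\n");   /* runs in unsafe mode too! */

  /* Fixed shape: braces force the intended pairing.  */
  if (unsafe_mode)
    {
      if (dump_file)
        fprintf (dump_file, "unknown alloc size, continuing\n");
    }
  else
    fprintf (stderr, "rejected\n");
}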
46
gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 20
+%global gcc_release 21

 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -172,6 +172,26 @@ Patch31: 0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch
 Patch32: 0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch
 Patch33: 0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch
 Patch34: 0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch
+Patch35: 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
+Patch36: 0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
+Patch37: 0037-Perform-early-if-conversion-of-simple-arithmetic.patch
+Patch38: 0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
+Patch39: 0039-Match-double-sized-mul-pattern.patch
+Patch40: 0040-Port-icp-patch-to-GCC-12.patch
+Patch41: 0041-Port-fixes-in-icp-to-GCC-12.patch
+Patch42: 0042-Add-split-complex-instructions-pass.patch
+Patch43: 0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
+Patch44: 0044-Port-maxmin-patch-to-GCC-12.patch
+Patch45: 0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
+Patch46: 0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
+Patch47: 0047-AES-Implement-AES-pattern-matching.patch
+Patch48: 0048-crypto-accel-add-optimization-level-requirement-to-t.patch
+Patch49: 0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
+Patch50: 0050-Port-IPA-prefetch-to-GCC-12.patch
+Patch51: 0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
+Patch52: 0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
+Patch53: 0053-struct-reorg-Add-Semi-Relayout.patch
+Patch54: 0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch

 # Part 3000 ~ 4999
 %ifarch loongarch64
@@ -801,6 +821,26 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch32 -p1
 %patch33 -p1
 %patch34 -p1
+%patch35 -p1
+%patch36 -p1
+%patch37 -p1
+%patch38 -p1
+%patch39 -p1
+%patch40 -p1
+%patch41 -p1
+%patch42 -p1
+%patch43 -p1
+%patch44 -p1
+%patch45 -p1
+%patch46 -p1
+%patch47 -p1
+%patch48 -p1
+%patch49 -p1
+%patch50 -p1
+%patch51 -p1
+%patch52 -p1
+%patch53 -p1
+%patch54 -p1

 %ifarch loongarch64
 %patch3001 -p1
@@ -3186,6 +3226,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*

 %changelog
+* Thu Apr 11 2024 Zhengchen Hui <zhengchenhui1@huawei.com> - 12.3.1-21
+- Type: Sync
+- DESC: Sync patch from openeuler/gcc
+
 * Thu Apr 11 2024 Zhenyu Zhao <zhaozhenyu17@huawei.com> - 12.3.1-20
 - Type: Sync
 - DESC: Sync patch from openeuler/gcc