240 lines
9.9 KiB
Diff
240 lines
9.9 KiB
Diff
|
|
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
|
||
|
|
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
|
||
|
|
Date: Tue, 12 Mar 2024 23:30:56 +0800
|
||
|
|
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
|
||
|
|
vectorization. It takes the minimum of the number of iterations and the segment length;
|
||
|
|
it helps to speed up loops with a small number of iterations when only the tail can
|
||
|
|
be vectorized
|
||
|
|
|
||
|
|
---
|
||
|
|
gcc/params.opt | 5 ++
|
||
|
|
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
|
||
|
|
gcc/tree-data-ref.cc | 67 +++++++++++++------
|
||
|
|
gcc/tree-data-ref.h | 11 ++-
|
||
|
|
gcc/tree-vect-data-refs.cc | 14 +++-
|
||
|
|
5 files changed, 95 insertions(+), 25 deletions(-)
|
||
|
|
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
|
||
|
|
|
||
|
|
diff --git a/gcc/params.opt b/gcc/params.opt
|
||
|
|
index 6176d4790..7e5c119cf 100644
|
||
|
|
--- a/gcc/params.opt
|
||
|
|
+++ b/gcc/params.opt
|
||
|
|
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
|
||
|
|
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
|
||
|
|
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
|
||
|
|
|
||
|
|
+-param=vect-alias-flexible-segment-len=
|
||
|
|
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
|
||
|
|
+Use a minimum length of different segments. Currently the minimum between
|
||
|
|
+iteration number and vectorization length is chosen by this param.
|
||
|
|
+
|
||
|
|
-param=vect-max-version-for-alignment-checks=
|
||
|
|
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
|
||
|
|
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..894f075f3
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
|
||
|
|
@@ -0,0 +1,23 @@
|
||
|
|
+/* { dg-do compile } */
|
||
|
|
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
|
||
|
|
+
|
||
|
|
+#define TYPE int
|
||
|
|
+#define SIZE 257
|
||
|
|
+
|
||
|
|
+void __attribute__ ((weak))
|
||
|
|
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
|
||
|
|
+{
|
||
|
|
+ for (int i = 0; i < SIZE; ++i)
|
||
|
|
+ x[i * n] += y[i * n];
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
|
||
|
|
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
|
||
|
|
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
|
||
|
|
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
|
||
|
|
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
|
||
|
|
+ an overlap check that multiplies by (257-1)*4. */
|
||
|
|
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
|
||
|
|
+/* One range check and a check for n being zero. */
|
||
|
|
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
|
||
|
|
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
|
||
|
|
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
|
||
|
|
index 397792c35..e6ae9e847 100644
|
||
|
|
--- a/gcc/tree-data-ref.cc
|
||
|
|
+++ b/gcc/tree-data-ref.cc
|
||
|
|
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
|
||
|
|
same arguments. Try to optimize cases in which the second access
|
||
|
|
is a write and in which some overlap is valid. */
|
||
|
|
|
||
|
|
-static bool
|
||
|
|
-create_waw_or_war_checks (tree *cond_expr,
|
||
|
|
+static void
|
||
|
|
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
|
||
|
|
const dr_with_seg_len_pair_t &alias_pair)
|
||
|
|
{
|
||
|
|
const dr_with_seg_len& dr_a = alias_pair.first;
|
||
|
|
const dr_with_seg_len& dr_b = alias_pair.second;
|
||
|
|
|
||
|
|
- /* Check for cases in which:
|
||
|
|
-
|
||
|
|
- (a) DR_B is always a write;
|
||
|
|
- (b) the accesses are well-ordered in both the original and new code
|
||
|
|
- (see the comment above the DR_ALIAS_* flags for details); and
|
||
|
|
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
|
||
|
|
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
|
||
|
|
- return false;
|
||
|
|
-
|
||
|
|
- /* Check for equal (but possibly variable) steps. */
|
||
|
|
tree step = DR_STEP (dr_a.dr);
|
||
|
|
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
|
||
|
|
- return false;
|
||
|
|
-
|
||
|
|
- /* Make sure that we can operate on sizetype without loss of precision. */
|
||
|
|
tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
|
||
|
|
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
|
||
|
|
- return false;
|
||
|
|
|
||
|
|
/* All addresses involved are known to have a common alignment ALIGN.
|
||
|
|
We can therefore subtract ALIGN from an exclusive endpoint to get
|
||
|
|
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
|
||
|
|
fold_convert (ssizetype, indicator),
|
||
|
|
ssize_int (0));
|
||
|
|
|
||
|
|
- /* Get lengths in sizetype. */
|
||
|
|
- tree seg_len_a
|
||
|
|
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
|
||
|
|
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
|
||
|
|
|
||
|
|
/* Each access has the following pattern:
|
||
|
|
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
|
||
|
|
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
|
||
|
|
if (dump_enabled_p ())
|
||
|
|
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+/* This is a wrapper function for create_waw_or_war_checks2. */
|
||
|
|
+static bool
|
||
|
|
+create_waw_or_war_checks (tree *cond_expr,
|
||
|
|
+ const dr_with_seg_len_pair_t &alias_pair)
|
||
|
|
+{
|
||
|
|
+ const dr_with_seg_len& dr_a = alias_pair.first;
|
||
|
|
+ const dr_with_seg_len& dr_b = alias_pair.second;
|
||
|
|
+
|
||
|
|
+ /* Check for cases in which:
|
||
|
|
+
|
||
|
|
+ (a) DR_B is always a write;
|
||
|
|
+ (b) the accesses are well-ordered in both the original and new code
|
||
|
|
+ (see the comment above the DR_ALIAS_* flags for details); and
|
||
|
|
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
|
||
|
|
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ /* Check for equal (but possibly variable) steps. */
|
||
|
|
+ tree step = DR_STEP (dr_a.dr);
|
||
|
|
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ /* Make sure that we can operate on sizetype without loss of precision. */
|
||
|
|
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
|
||
|
|
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
|
||
|
|
+ return false;
|
||
|
|
+
|
||
|
|
+ /* Get lengths in sizetype. */
|
||
|
|
+ tree seg_len_a
|
||
|
|
+ = fold_convert (sizetype,
|
||
|
|
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
|
||
|
|
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
|
||
|
|
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
|
||
|
|
+ {
|
||
|
|
+ tree seg_len2_a
|
||
|
|
+ = fold_convert (sizetype,
|
||
|
|
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
|
||
|
|
+ tree cond_expr2;
|
||
|
|
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
|
||
|
|
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
|
||
|
|
+ *cond_expr, cond_expr2);
|
||
|
|
+ }
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
|
||
|
|
index f643a95b2..9bc5f16ee 100644
|
||
|
|
--- a/gcc/tree-data-ref.h
|
||
|
|
+++ b/gcc/tree-data-ref.h
|
||
|
|
@@ -213,12 +213,19 @@ class dr_with_seg_len
|
||
|
|
public:
|
||
|
|
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
|
||
|
|
unsigned int a)
|
||
|
|
- : dr (d), seg_len (len), access_size (size), align (a) {}
|
||
|
|
-
|
||
|
|
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
|
||
|
|
+ {}
|
||
|
|
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
|
||
|
|
+ unsigned HOST_WIDE_INT size, unsigned int a)
|
||
|
|
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
|
||
|
|
+ {}
|
||
|
|
data_reference_p dr;
|
||
|
|
/* The offset of the last access that needs to be checked minus
|
||
|
|
the offset of the first. */
|
||
|
|
tree seg_len;
|
||
|
|
+ /* The second version of segment length. Currently this is used to
|
||
|
|
+ soften checks for a small number of iterations. */
|
||
|
|
+ tree seg_len2;
|
||
|
|
/* A value that, when added to abs (SEG_LEN), gives the total number of
|
||
|
|
bytes in the segment. */
|
||
|
|
poly_uint64 access_size;
|
||
|
|
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
|
||
|
|
index 4e615b80b..04e68f621 100644
|
||
|
|
--- a/gcc/tree-vect-data-refs.cc
|
||
|
|
+++ b/gcc/tree-vect-data-refs.cc
|
||
|
|
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
|
||
|
|
{
|
||
|
|
poly_uint64 lower_bound;
|
||
|
|
tree segment_length_a, segment_length_b;
|
||
|
|
+ tree segment_length2_a, segment_length2_b;
|
||
|
|
unsigned HOST_WIDE_INT access_size_a, access_size_b;
|
||
|
|
unsigned int align_a, align_b;
|
||
|
|
|
||
|
|
@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
|
||
|
|
{
|
||
|
|
segment_length_a = size_zero_node;
|
||
|
|
segment_length_b = size_zero_node;
|
||
|
|
+ segment_length2_a = size_zero_node;
|
||
|
|
+ segment_length2_b = size_zero_node;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
|
||
|
|
length_factor = scalar_loop_iters;
|
||
|
|
else
|
||
|
|
length_factor = size_int (vect_factor);
|
||
|
|
+ /* In any case we should remember scalar_loop_iters
|
||
|
|
+ this helps to create flexible aliasing check
|
||
|
|
+ for small number of iterations. */
|
||
|
|
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
|
||
|
|
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
|
||
|
|
+ segment_length2_a
|
||
|
|
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
|
||
|
|
+ segment_length2_b
|
||
|
|
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
|
||
|
|
}
|
||
|
|
access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
|
||
|
|
access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
|
||
|
|
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
|
||
|
|
}
|
||
|
|
|
||
|
|
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
|
||
|
|
- access_size_a, align_a);
|
||
|
|
+ segment_length2_a, access_size_a, align_a);
|
||
|
|
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
|
||
|
|
- access_size_b, align_b);
|
||
|
|
+ segment_length2_b, access_size_b, align_b);
|
||
|
|
/* Canonicalize the order to be the one that's needed for accurate
|
||
|
|
RAW, WAR and WAW flags, in cases where the data references are
|
||
|
|
well-ordered. The order doesn't really matter otherwise,
|
||
|
|
--
|
||
|
|
2.33.0
|
||
|
|
|