gcc/0081-Loop-distribution-Insert-temp-arrays-built-from-isom.patch

827 lines
27 KiB
Diff
Raw Normal View History

From ca2a541ed3425bec64f97fe277c6c02bf4f20049 Mon Sep 17 00:00:00 2001
From: benniaobufeijiushiji <linda7@huawei.com>
Date: Thu, 27 Oct 2022 10:26:34 +0800
Subject: [PATCH 33/35] [Loop-distribution] Insert temp arrays built from
isomorphic stmts Use option -ftree-slp-transpose-vectorize Build temp arrays
for isomorphic stmt and regard them as new seed_stmts for loop distribution.
---
gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c | 67 +++
gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c | 17 +
gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c | 19 +
gcc/tree-loop-distribution.c | 577 +++++++++++++++++++-
4 files changed, 663 insertions(+), 17 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c
create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c
new file mode 100644
index 000000000..649463647
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c
@@ -0,0 +1,67 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-do run { target { aarch64*-*-linux* } } } */
+/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details -save-temps" } */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static unsigned inline abs2 (unsigned a) /* Works on A as two 16-bit halves. */
+{
+ unsigned s = ((a>>15)&0x10001)*0xffff; /* 0xffff in each half of S whose top bit (15 or 31) is set in A. */
+ return (a+s)^s; /* Presumably a per-half absolute value of packed 16-bit numbers -- confirm. */
+}
+
+int foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib)
+{
+ unsigned tmp[4][4];
+ unsigned a0, a1, a2, a3;
+ int sum = 0;
+ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) /* Four rows; OXA and OXB advance by IA and IB per row. */
+ {
+ a0 = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); /* Two byte differences packed into one word; a0..a3 share this shape. */
+ a1 = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16);
+ a2 = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16);
+ a3 = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16);
+ int t0 = a0 + a1;
+ int t1 = a0 - a1;
+ int t2 = a2 + a3;
+ int t3 = a2 - a3;
+ tmp[i][0] = t0 + t2; /* Row I of TMP is written with sums and differences of t0..t3. */
+ tmp[i][2] = t0 - t2;
+ tmp[i][1] = t1 + t3;
+ tmp[i][3] = t1 - t3;
+ }
+ for (int i = 0; i < 4; i++) /* Second pass reads TMP column by column. */
+ {
+ int t0 = tmp[0][i] + tmp[1][i];
+ int t1 = tmp[0][i] - tmp[1][i];
+ int t2 = tmp[2][i] + tmp[3][i];
+ int t3 = tmp[2][i] - tmp[3][i];
+ a0 = t0 + t2;
+ a2 = t0 - t2;
+ a1 = t1 + t3;
+ a3 = t1 - t3;
+ sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+ }
+ return (((unsigned short) sum) + ((unsigned) sum >>16)) >> 1; /* Low 16 bits of SUM plus its high 16 bits, halved. */
+}
+
+int main ()
+{
+ unsigned char oxa[128] = {0};
+ unsigned char oxb[128] = {0};
+ for (int i = 0; i < 128; i++) /* Fill both buffers with a fixed pattern. */
+ {
+ oxa[i] += i * 3;
+ oxb[i] = i * 2;
+ }
+ int sum = foo (oxa, 16, oxb, 32); /* Row strides: 16 bytes for OXA, 32 bytes for OXB. */
+ if (sum != 736) /* 736 is the value FOO must return for this input. */
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c
new file mode 100644
index 000000000..1b50fd27d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */
+
+unsigned a0[4], a1[4], a2[4], a3[4];
+
+void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib)
+{
+ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) /* Four rows; OXA and OXB advance by IA and IB per row. */
+ {
+ a0[i] = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); /* Each packed result is stored straight into a global array at index I. */
+ a1[i] = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16);
+ a2[i] = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16);
+ a3[i] = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16);
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c
new file mode 100644
index 000000000..94b992b05
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c
@@ -0,0 +1,19 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */
+
+unsigned a0[4], a1[4], a2[4], a3[4];
+
+void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib)
+{
+ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) /* Four rows; OXA and OXB advance by IA and IB per row. */
+ {
+ a0[i] = ((oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16)) + 1; /* The four stores end in different operations: +, -, *, /. */
+ a1[i] = ((oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16)) - 2;
+ a2[i] = ((oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16)) * 3;
+ a3[i] = ((oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16)) / 4;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "Insertion removed" 1 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */
\ No newline at end of file
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index c08af6562..88b56379c 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see
| D(I) = A(I-1)*E
|ENDDO
+ If an unvectorizable loop has grouped loads, and calculations from grouped
+ loads are isomorphic, build temp arrays using stmts where isomorphic
+ calculations end. After distribution, the partition built from temp
+ arrays can be vectorized in pass SLP after loop unrolling. For example,
+
+ |DO I = 1, N
+ | A = FOO (ARG_1);
+ | B = FOO (ARG_2);
+ | C = BAR_0 (A);
+ | D = BAR_1 (B);
+ |ENDDO
+
+ is transformed to
+
+ |DO I = 1, N
+ | J = FOO (ARG_1);
+ | K = FOO (ARG_2);
+ | X[I] = J;
+ | Y[I] = K;
+ | A = X[I];
+ | B = Y[I];
+ | C = BAR_0 (A);
+ | D = BAR_1 (B);
+ |ENDDO
+
+ and is then distributed to
+
+ |DO I = 1, N
+ | J = FOO (ARG_1);
+ | K = FOO (ARG_2);
+ | X[I] = J;
+ | Y[I] = K;
+ |ENDDO
+
+ |DO I = 1, N
+ | A = X[I];
+ | B = Y[I];
+ | C = BAR_0 (A);
+ | D = BAR_1 (B);
+ |ENDDO
+
Loop distribution is the dual of loop fusion. It separates statements
of a loop (or loop nest) into multiple loops (or loop nests) with the
same loop header. The major goal is to separate statements which may
@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see
1) Seed partitions with specific type statements. For now we support
two types seed statements: statement defining variable used outside
- of loop; statement storing to memory.
+ of loop; statement storing to memory. Moreover, for unvectorizable
+ loops, we try to find isomorphic stmts from grouped load and build
+ temp arrays as new seed statements.
2) Build reduced dependence graph (RDG) for loop to be distributed.
The vertices (RDG:V) model all statements in the loop and the edges
(RDG:E) model flow and control dependencies between statements.
@@ -643,7 +686,8 @@ class loop_distribution
/* Returns true when PARTITION1 and PARTITION2 access the same memory
object in RDG. */
bool share_memory_accesses (struct graph *rdg,
- partition *partition1, partition *partition2);
+ partition *partition1, partition *partition2,
+ hash_set<tree> *excluded_arrays);
/* For each seed statement in STARTING_STMTS, this function builds
partition for it by adding depended statements according to RDG.
@@ -686,8 +730,9 @@ class loop_distribution
/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
ALIAS_DDRS contains ddrs which need runtime alias check. */
- void finalize_partitions (class loop *loop, vec<struct partition *>
- *partitions, vec<ddr_p> *alias_ddrs);
+ void finalize_partitions (class loop *loop,
+ vec<struct partition *> *partitions,
+ vec<ddr_p> *alias_ddrs, bitmap producers);
/* Analyze loop form and if it's vectorizable to decide if we need to
insert temp arrays to distribute it. */
@@ -701,6 +746,28 @@ class loop_distribution
inline void rebuild_rdg (loop_p loop, struct graph *&rdg,
control_dependences *cd);
+
+ /* If loop is not distributed, remove inserted temp arrays. */
+ void remove_insertion (loop_p loop, struct graph *flow_only_rdg,
+ bitmap producers, struct partition *partition);
+
+ /* Insert temp arrays if isomorphic computation exists. Temp arrays will be
+ regarded as SEED_STMTS for building partitions in succeeding processes. */
+ bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
+ hash_set<tree> *tmp_array_vars, bitmap producers);
+
+ void build_producers (loop_p loop, bitmap producers,
+ vec<gimple *> &transformed);
+
+ void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv,
+ bitmap cut_points, hash_set <tree> *tmp_array_vars,
+ bitmap producers);
+
+ /* Fuse PARTITIONS built from inserted temp arrays into one partition,
+ fuse the rest into another. */
+ void merge_remaining_partitions (vec<struct partition *> *partitions,
+ bitmap producers);
+
/* Distributes the code from LOOP in such a way that producer statements
are placed before consumer statements. Tries to separate only the
statements from STMTS into separate loops. Returns the number of
@@ -1913,7 +1980,8 @@ loop_distribution::classify_partition (loop_p loop,
bool
loop_distribution::share_memory_accesses (struct graph *rdg,
- partition *partition1, partition *partition2)
+ partition *partition1, partition *partition2,
+ hash_set <tree> *excluded_arrays)
{
unsigned i, j;
bitmap_iterator bi, bj;
@@ -1947,7 +2015,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
&& operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
&& operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
- && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
+ && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)
+ /* An exception, if PARTITION1 and PARTITION2 contain the
+ temp array we inserted, do not merge them. */
+ && !excluded_arrays->contains (DR_REF (dr1)))
return true;
}
}
@@ -2909,13 +2980,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions)
}
}
+void
+loop_distribution::merge_remaining_partitions
+                        (vec<struct partition *> *partitions,
+                         bitmap producers)
+{
+  /* PRODUCER_PART collects every partition holding a stmt recorded in
+     PRODUCERS; REST_PART collects all the others.  */
+  struct partition *producer_part = NULL;
+  struct partition *rest_part = NULL;
+  struct partition *cur = NULL;
+  unsigned idx = 0;
+  while (partitions->iterate (idx, &cur))
+    {
+      struct partition **into = (bitmap_intersect_p (producers, cur->stmts)
+                                 ? &producer_part : &rest_part);
+
+      /* The first partition of each kind stays in the vector and becomes
+         the merge target for later partitions of the same kind.  */
+      if (*into == NULL)
+        {
+          *into = cur;
+          idx++;
+          continue;
+        }
+
+      /* Fold CUR into its target and drop it.  Slot IDX is examined again
+         after the removal, so IDX is not advanced here.  */
+      partition_merge_into (NULL, *into, cur, FUSE_FINALIZE);
+      partitions->unordered_remove (idx);
+      partition_free (cur);
+    }
+}
+
void
loop_distribution::finalize_partitions (class loop *loop,
vec<struct partition *> *partitions,
- vec<ddr_p> *alias_ddrs)
+ vec<ddr_p> *alias_ddrs,
+ bitmap producers)
{
unsigned i;
- struct partition *partition, *a;
+ struct partition *partition;
if (partitions->length () == 1
|| alias_ddrs->length () > 0)
@@ -2947,13 +3052,7 @@ loop_distribution::finalize_partitions (class loop *loop,
|| (loop->inner == NULL
&& i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
{
- a = (*partitions)[0];
- for (i = 1; partitions->iterate (i, &partition); ++i)
- {
- partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
- partition_free (partition);
- }
- partitions->truncate (1);
+ merge_remaining_partitions (partitions, producers);
}
/* Fuse memset builtins if possible. */
@@ -3758,6 +3857,404 @@ find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts)
return decide_stmts_by_profit (candi_stmts, stmts);
}
+/* Return the single SSA_NAME that indexes the stored-to ARRAY_REF of every
+   stmt in SEED_STMTS.  Return NULL unless each seed stmt is an assignment
+   with a vdef whose LHS uses exactly one SSA_NAME index, the same for all.  */
+
+static tree
+find_index (vec<gimple *> seed_stmts)
+{
+  if (seed_stmts.length () == 0)
+    return NULL;
+  tree index = NULL;
+  for (unsigned ui = 0; ui < seed_stmts.length (); ui++)
+    {
+      gimple *seed = seed_stmts[ui];
+      /* gimple_assign_lhs is only meaningful on assignments.  */
+      if (!gimple_vdef (seed) || !is_gimple_assign (seed))
+        return NULL;
+      tree lhs = gimple_assign_lhs (seed);
+      /* Counted per stmt, so that a later seed without any SSA_NAME index
+         is rejected even though an earlier seed already supplied INDEX.  */
+      unsigned num_index = 0;
+      while (TREE_CODE (lhs) == ARRAY_REF)
+        {
+          tree op = TREE_OPERAND (lhs, 1);
+          if (TREE_CODE (op) == SSA_NAME)
+            {
+              if (++num_index > 1)
+                return NULL;
+              if (index == NULL)
+                index = op;
+              else if (index != op)
+                return NULL;
+            }
+          lhs = TREE_OPERAND (lhs, 0);
+        }
+      if (num_index == 0)
+        return NULL;
+    }
+  return index;
+}
+
+/* Set FOUND_INC to true if some successor of V_PHI in RDG is a PLUS_EXPR
+   assignment that has an operand equal to one and an edge back to the
+   source of the edge that reached it.  FOUND_INC is otherwise untouched.  */
+
+static void
+check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc)
+{
+  for (struct graph_edge *out = v_phi->succ; out; out = out->succ_next)
+    {
+      struct vertex *v_inc = &(rdg->vertices[out->dest]);
+      gimple *inc_stmt = RDGV_STMT (v_inc);
+      if (!is_gimple_assign (inc_stmt)
+          || gimple_expr_code (inc_stmt) != PLUS_EXPR)
+        continue;
+
+      /* One of the two addends has to be the constant one.  */
+      if (!integer_onep (gimple_assign_rhs1 (inc_stmt))
+          && !integer_onep (gimple_assign_rhs2 (inc_stmt)))
+        continue;
+
+      /* Look for the two-vertex cycle inc <--> phi: an edge leaving the
+         increment whose destination is the source of OUT.  */
+      struct graph_edge *back = v_inc->succ;
+      while (back && back->dest != out->src)
+        back = back->succ_next;
+      if (back)
+        {
+          found_inc = true;
+          return;
+        }
+    }
+}
+
+/* True if PHI_STMT has exactly two arguments, one of which is zero.  */
+static inline bool
+iv_check_phi_stmt (gimple *phi_stmt)
+{
+  if (gimple_phi_num_args (phi_stmt) != 2)
+    return false;
+  return (integer_zerop (gimple_phi_arg_def (phi_stmt, 0))
+          || integer_zerop (gimple_phi_arg_def (phi_stmt, 1)));
+}
+
+/* Return the common array index of SEED_STMTS found by find_index, or NULL.
+   When the first seed stmt has a vertex in FLOW_ONLY_RDG, the index must
+   also be the result of a predecessor phi that passes iv_check_phi_stmt
+   and check_phi_inc.  */
+
+static tree
+get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts)
+{
+  tree index = find_index (seed_stmts);
+  if (index == NULL)
+    return NULL;
+
+  /* Locate the vertex of the first seed stmt.  */
+  struct vertex *seed_v = NULL;
+  for (int i = 0; i < flow_only_rdg->n_vertices && seed_v == NULL; i++)
+    if (RDGV_STMT (&(flow_only_rdg->vertices[i])) == seed_stmts[0])
+      seed_v = &(flow_only_rdg->vertices[i]);
+  if (seed_v == NULL)
+    return index;
+
+  /* Search its predecessors for the phi that defines INDEX.  */
+  for (struct graph_edge *e = seed_v->pred; e; e = e->pred_next)
+    {
+      struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]);
+      gimple *phi_stmt = RDGV_STMT (v_phi);
+      if (gimple_code (phi_stmt) != GIMPLE_PHI
+          || gimple_phi_result (phi_stmt) != index)
+        continue;
+      if (!iv_check_phi_stmt (phi_stmt))
+        return NULL;
+      /* The phi must be advanced by an increment among its successors.  */
+      bool found_inc = false;
+      check_phi_inc (v_phi, flow_only_rdg, found_inc);
+      return found_inc ? index : NULL;
+    }
+  /* No predecessor phi defines INDEX.  */
+  return NULL;
+}
+
+/* Return false if walking successor edges of FLOW_ONLY_RDG from the vertices
+   in ROOT_MAP reaches a vertex that is itself in ROOT_MAP, true otherwise.  */
+
+static bool
+check_no_dependency (struct graph *flow_only_rdg, bitmap root_map)
+{
+  auto_vec<unsigned, 16> worklist;
+  auto_bitmap seen;
+  bitmap_iterator bi;
+  unsigned root;
+  EXECUTE_IF_SET_IN_BITMAP (root_map, 0, root, bi)
+    worklist.safe_push (root);
+
+  /* WORKLIST grows while it is being scanned.  */
+  for (unsigned pos = 0; pos < worklist.length (); pos++)
+    {
+      struct vertex *v = &(flow_only_rdg->vertices[worklist[pos]]);
+      for (struct graph_edge *e = v->succ; e; e = e->succ_next)
+        {
+          if (bitmap_bit_p (root_map, e->dest))
+            return false;
+          /* bitmap_set_bit reports whether the bit was newly set.  */
+          if (bitmap_set_bit (seen, e->dest))
+            worklist.safe_push (e->dest);
+        }
+    }
+  return true;
+}
+
+/* Mark in CUT_POINTS the vertices of FLOW_ONLY_RDG whose stmts were chosen
+   by find_isomorphic_stmts for VINFO.  Return the number of marked
+   vertices, or 0 when no stmts were found or when check_no_dependency
+   rejects the marked set.  */
+
+static unsigned
+get_cut_points (struct graph *flow_only_rdg, bitmap cut_points,
+                loop_vec_info vinfo)
+{
+  /* Candidate stmts to cut at.  */
+  auto_vec<gimple *> candidates;
+  if (!find_isomorphic_stmts (vinfo, candidates))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "No temp array insertion: no isomorphic stmts"
+                 " were found.\n");
+      return 0;
+    }
+
+  /* Translate the chosen stmts into vertex indices.  */
+  for (int v = 0; v < flow_only_rdg->n_vertices; v++)
+    if (candidates.contains (RDG_STMT (flow_only_rdg, v)))
+      bitmap_set_bit (cut_points, v);
+  unsigned n_marked = bitmap_count_bits (cut_points);
+
+  /* Give up if one cut point can be reached from another one.  */
+  if (!check_no_dependency (flow_only_rdg, cut_points))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "No temp array inserted: data dependency"
+                 " among isomorphic stmts.\n");
+      return 0;
+    }
+
+  return n_marked;
+}
+
+static void
+build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi,
+ poly_uint64 array_extent, tree iv,
+ hash_set<tree> *tmp_array_vars, vec<gimple *> *transformed)
+{ /* Route the value defined by V's stmt through element IV of a new temp array of ARRAY_EXTENT elements. */
+ gimple *stmt = RDGV_STMT (v);
+ tree lhs = gimple_assign_lhs (stmt);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "original stmt:\t");
+ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS);
+ }
+ tree var_ssa = duplicate_ssa_name (lhs, stmt); /* Fresh name that STMT defines from now on. */
+ gimple_assign_set_lhs (stmt, var_ssa);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "changed to:\t");
+ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS);
+ }
+ gimple_set_uid (gsi_stmt (gsi), -1); /* Presumably keeps the stmt at GSI from matching a cut point bit again -- confirm. */
+ tree vect_elt_type = TREE_TYPE (lhs);
+ tree array_type = build_array_type_nelts (vect_elt_type, array_extent);
+ tree array = create_tmp_var (array_type);
+ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); /* An ARRAY_REF tree array[iv], not an SSA name. */
+ tmp_array_vars->add (array_ssa);
+ gimple *store = gimple_build_assign (array_ssa, var_ssa); /* The producer: array[iv] = var_ssa. */
+ tree new_vdef = make_ssa_name (gimple_vop (cfun), store);
+ gsi_insert_after (&gsi, store, GSI_NEW_STMT);
+ gimple_set_vdef (store, new_vdef);
+ transformed->safe_push (store); /* Only the store is recorded in TRANSFORMED. */
+ gimple_set_uid (gsi_stmt (gsi), -1);
+ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); /* A second ARRAY_REF tree for the load side. */
+ tmp_array_vars->add (array_ssa2);
+ gimple *load = gimple_build_assign (lhs, array_ssa2); /* The consumer: the original LHS is reloaded from array[iv]. */
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "insert stmt:\t");
+ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS);
+ fprintf (dump_file, " and stmt:\t");
+ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS);
+ }
+ gimple_set_vuse (load, new_vdef); /* The load uses the virtual definition made by the store. */
+ gsi_insert_after (&gsi, load, GSI_NEW_STMT);
+ gimple_set_uid (gsi_stmt (gsi), -1);
+}
+
+/* Renumber the uids of LOOP's stmts in stmts_from_loop order, then set in
+   PRODUCERS the bit of the uid of every stmt in TRANSFORMED.  */
+
+void
+loop_distribution::build_producers (loop_p loop, bitmap producers,
+                                    vec<gimple *> &transformed)
+{
+  auto_vec<gimple *, 10> loop_stmts;
+  stmts_from_loop (loop, &loop_stmts);
+
+  for (unsigned pos = 0; pos < loop_stmts.length (); pos++)
+    gimple_set_uid (loop_stmts[pos], pos);
+
+  /* The stmts in TRANSFORMED carry their new uid by now.  */
+  for (unsigned pos = 0; pos < transformed.length (); pos++)
+    bitmap_set_bit (producers, gimple_uid (transformed[pos]));
+}
+
+/* For every stmt of LOOP whose uid is set in CUT_POINTS, transform
+
+ A = FOO (ARG_1);
+
+ to
+
+ STMT_1: A1 = FOO (ARG_1);
+ STMT_2: X[I] = A1;
+ STMT_3: A = X[I];
+
+ where I is IV. STMT_2 is the producer that stores to the temp array and
+ STMT_3 is the consumer that loads from it. Producers go to PRODUCERS. */
+
+void
+loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg,
+ tree iv, bitmap cut_points,
+ hash_set<tree> *tmp_array_vars,
+ bitmap producers)
+{
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "=== do insertion ===\n");
+
+ auto_vec<gimple *> transformed;
+
+ /* Number of elements of each temp array: latch executions of LOOP plus one. */
+ poly_uint64 array_extent
+ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1;
+
+ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
+ bb_top_order_cmp_r);
+
+ for (int i = 0; i < int (loop->num_nodes); i++)
+ {
+ basic_block bb = bbs[i];
+
+ /* Transform each cut point in BB; GSI is passed by reference to build_temp_array, which inserts after it. */
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ unsigned j = gimple_uid (gsi_stmt (gsi)); /* The uid doubles as the vertex index in FLOW_ONLY_RDG. */
+ if (bitmap_bit_p (cut_points, j))
+ {
+ struct vertex *v = &(flow_only_rdg->vertices[j]);
+ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars,
+ &transformed);
+ }
+ }
+ }
+ build_producers (loop, producers, transformed); /* Renumbers uids and records the inserted stores in PRODUCERS. */
+ update_ssa (TODO_update_ssa);
+ free (bbs);
+}
+
+/* GSI points at a producer STMT_2 created by temp array insertion:
+     STMT_1: M = FOO (ARG_1);  STMT_2: X[I] = M;  STMT_3: A = X[I];
+   Make STMT_1 define A again and remove STMT_2 and STMT_3, clearing their
+   bits in PARTITION if it is non-NULL.  STMT_2 is taken from GSI instead of
+   vertex J of FLOW_ONLY_RDG, as that graph may predate the insertion and
+   its renumbering of stmt uids.  GSI ends up after the removed stmts.  */
+static void
+reset_gimple_assign (struct graph *flow_only_rdg ATTRIBUTE_UNUSED,
+                     struct partition *partition,
+                     gimple_stmt_iterator &gsi, int j)
+{
+  gimple *stmt = gsi_stmt (gsi);
+  gcc_checking_assert (gimple_uid (stmt) == (unsigned) j);
+  gimple *prev = stmt->prev;
+  gimple *next = stmt->next;
+  tree n_lhs = gimple_assign_lhs (next);
+  gimple_assign_set_lhs (prev, n_lhs);
+  unlink_stmt_vdef (stmt);
+  if (partition)
+    bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
+  gsi_remove (&gsi, true);
+  release_defs (stmt);
+  if (partition)
+    bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
+  gsi_remove (&gsi, true);
+}
+
+/* Remove the temp arrays of LOOP at each stmt whose uid is in PRODUCERS.  */
+void
+loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg,
+                                     bitmap producers, struct partition *partition)
+{
+  basic_block *bbs = get_loop_body_in_custom_order (loop, this,
+                                                    bb_top_order_cmp_r);
+  for (int i = 0; i < int (loop->num_nodes); i++)
+    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
+      {
+        unsigned j = gimple_uid (gsi_stmt (gsi));
+        /* A removal already leaves GSI on the next stmt or at the end.  */
+        if (bitmap_bit_p (producers, j))
+          reset_gimple_assign (flow_only_rdg, partition, gsi, j);
+        else
+          gsi_next (&gsi);
+      }
+  update_ssa (TODO_update_ssa);
+  free (bbs);
+}
+
+/* Insert temp arrays into LOOP if isomorphic computation exists.  The
+   inserted stores act as SEED_STMTS for the later partition building.  */
+
+bool
+loop_distribution::insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
+                                       hash_set<tree> *tmp_array_vars, bitmap producers)
+{
+  bool inserted = false;
+  struct graph *flow_only_rdg = build_rdg (loop, NULL);
+  gcc_checking_assert (flow_only_rdg != NULL);
+  tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts);
+  if (iv != NULL)
+    {
+      auto_bitmap cut_points;
+      loop_vec_info vinfo = loop_vec_info_for_loop (loop);
+      unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo);
+      /* VINFO is released whether or not cut points were found.  */
+      delete vinfo;
+      loop->aux = NULL;
+      if (n_cut_points != 0)
+        {
+          do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars,
+                        producers);
+          if (dump_enabled_p ())
+            {
+              dump_user_location_t loc = find_loop_location (loop);
+              dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:"
+                               " %d temp arrays inserted in Loop %d.\n",
+                               n_cut_points, loop->num);
+            }
+          inserted = true;
+        }
+      else if (dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "Loop %d no temp array insertion: no cut points"
+                 " found.\n", loop->num);
+    }
+  else if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Loop %d no temp array insertion: failed to get"
+             " iteration variable.\n", loop->num);
+  free_rdg (flow_only_rdg);
+  return inserted;
+}
+
+static bool find_seed_stmts_for_distribution (class loop *, vec<gimple *> *);
+
/* Distributes the code from LOOP in such a way that producer statements
are placed before consumer statements. Tries to separate only the
statements from STMTS into separate loops. Returns the number of
@@ -3814,6 +4311,34 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
return 0;
}
+ /* Try to distribute LOOP if LOOP is simple enough and unable to vectorize.
+ If LOOP has grouped loads, recursively find isomorphic stmts and insert
+ temp arrays, rebuild RDG and call find_seed_stmts_for_distribution
+ to replace STMTS. */
+
+ hash_set<tree> tmp_array_vars;
+
+ /* STMTs that define those inserted TMP_ARRAYs. */
+ auto_bitmap producers;
+
+ /* New SEED_STMTS after insertion. */
+ auto_vec<gimple *> work_list;
+ bool insert_success = false;
+ if (may_insert_temp_arrays (loop, rdg, cd))
+ {
+ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers))
+ {
+ if (find_seed_stmts_for_distribution (loop, &work_list))
+ {
+ insert_success = true;
+ stmts = work_list;
+ }
+ else
+ remove_insertion (loop, rdg, producers, NULL);
+ rebuild_rdg (loop, rdg, cd);
+ }
+ }
+
data_reference_p dref;
for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
dref->aux = (void *) (uintptr_t) i;
@@ -3894,7 +4419,7 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
for (int j = i + 1;
partitions.iterate (j, &partition); ++j)
{
- if (share_memory_accesses (rdg, into, partition))
+ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars))
{
partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
partitions.unordered_remove (j);
@@ -3944,7 +4469,7 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
}
}
- finalize_partitions (loop, &partitions, &alias_ddrs);
+ finalize_partitions (loop, &partitions, &alias_ddrs, producers);
/* If there is a reduction in all partitions make sure the last one
is not classified for builtin code generation. */
@@ -3962,6 +4487,24 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
}
nbp = partitions.length ();
+
+ /* If we have inserted TMP_ARRAYs but there is only one partition left in
+ the succeeding processes, remove those inserted TMP_ARRAYs back to the
+ original version. */
+
+ if (nbp == 1 && insert_success)
+ {
+ struct partition *partition = NULL;
+ partitions.iterate (0, &partition);
+ remove_insertion (loop, rdg, producers, partition);
+ if (dump_enabled_p ())
+ {
+ dump_user_location_t loc = find_loop_location (loop);
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:"
+ " unable to distribute loop %d.\n", loop->num);
+ }
+ }
+
if (nbp == 0
|| (nbp == 1 && !partition_builtin_p (partitions[0]))
|| (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
--
2.27.0.windows.1