906 lines
31 KiB
Diff
diff -N -urp a/gcc/Makefile.in b/gcc/Makefile.in
|
|
--- a/gcc/Makefile.in 2018-11-07 11:37:24.615223860 +0800
|
|
+++ b/gcc/Makefile.in 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -1292,6 +1292,7 @@ OBJS = \
|
|
gimple-iterator.o \
|
|
gimple-fold.o \
|
|
gimple-laddress.o \
|
|
+ gimple-loop-jam.o \
|
|
gimple-low.o \
|
|
gimple-pretty-print.o \
|
|
gimple-ssa-backprop.o \
|
|
diff -N -urp a/gcc/cfgloop.c b/gcc/cfgloop.c
|
|
--- a/gcc/cfgloop.c 2018-11-07 11:37:24.947223860 +0800
|
|
+++ b/gcc/cfgloop.c 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -296,13 +296,25 @@ establish_preds (struct loop *loop, stru
|
|
|
|
/* Add LOOP to the loop hierarchy tree where FATHER is father of the
|
|
added loop. If LOOP has some children, take care of that their
|
|
- pred field will be initialized correctly. */
|
|
+ pred field will be initialized correctly. If AFTER is non-null
|
|
+ then it's expected it's a pointer into FATHERs inner sibling
|
|
+ list and LOOP is added behind AFTER, otherwise it's added in front
|
|
+ of FATHERs siblings. */
|
|
|
|
void
|
|
-flow_loop_tree_node_add (struct loop *father, struct loop *loop)
|
|
+flow_loop_tree_node_add (struct loop *father, struct loop *loop,
|
|
+ struct loop *after)
|
|
{
|
|
- loop->next = father->inner;
|
|
- father->inner = loop;
|
|
+ if (after)
|
|
+ {
|
|
+ loop->next = after->next;
|
|
+ after->next = loop;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ loop->next = father->inner;
|
|
+ father->inner = loop;
|
|
+ }
|
|
|
|
establish_preds (loop, father);
|
|
}
|
|
diff -N -urp a/gcc/cfgloop.h b/gcc/cfgloop.h
|
|
--- a/gcc/cfgloop.h 2018-11-07 11:37:24.331223860 +0800
|
|
+++ b/gcc/cfgloop.h 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -324,7 +324,8 @@ void record_loop_exits (void);
|
|
void rescan_loop_exit (edge, bool, bool);
|
|
|
|
/* Loop data structure manipulation/querying. */
|
|
-extern void flow_loop_tree_node_add (struct loop *, struct loop *);
|
|
+extern void flow_loop_tree_node_add (struct loop *, struct loop *,
|
|
+ struct loop * = NULL);
|
|
extern void flow_loop_tree_node_remove (struct loop *);
|
|
extern bool flow_loop_nested_p (const struct loop *, const struct loop *);
|
|
extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block);
|
|
diff -N -urp a/gcc/cfgloopmanip.c b/gcc/cfgloopmanip.c
|
|
--- a/gcc/cfgloopmanip.c 2018-11-07 11:37:24.847223860 +0800
|
|
+++ b/gcc/cfgloopmanip.c 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -1026,9 +1026,11 @@ copy_loop_info (struct loop *loop, struc
|
|
}
|
|
|
|
/* Copies copy of LOOP as subloop of TARGET loop, placing newly
|
|
- created loop into loops structure. */
|
|
+ created loop into loops structure. If AFTER is non-null
|
|
+ the new loop is added at AFTER->next, otherwise in front of TARGETs
|
|
+ sibling list. */
|
|
struct loop *
|
|
-duplicate_loop (struct loop *loop, struct loop *target)
|
|
+duplicate_loop (struct loop *loop, struct loop *target, struct loop *after)
|
|
{
|
|
struct loop *cloop;
|
|
cloop = alloc_loop ();
|
|
@@ -1040,36 +1042,46 @@ duplicate_loop (struct loop *loop, struc
|
|
set_loop_copy (loop, cloop);
|
|
|
|
/* Add it to target. */
|
|
- flow_loop_tree_node_add (target, cloop);
|
|
+ flow_loop_tree_node_add (target, cloop, after);
|
|
|
|
return cloop;
|
|
}
|
|
|
|
/* Copies structure of subloops of LOOP into TARGET loop, placing
|
|
- newly created loops into loop tree. */
|
|
+ newly created loops into loop tree at the end of TARGETs sibling
|
|
+ list in the original order. */
|
|
void
|
|
duplicate_subloops (struct loop *loop, struct loop *target)
|
|
{
|
|
- struct loop *aloop, *cloop;
|
|
+ struct loop *aloop, *cloop, *tail;
|
|
|
|
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
|
|
+ ;
|
|
for (aloop = loop->inner; aloop; aloop = aloop->next)
|
|
{
|
|
- cloop = duplicate_loop (aloop, target);
|
|
+ cloop = duplicate_loop (aloop, target, tail);
|
|
+ tail = cloop;
|
|
+ gcc_assert (!tail->next);
|
|
duplicate_subloops (aloop, cloop);
|
|
}
|
|
}
|
|
|
|
/* Copies structure of subloops of N loops, stored in array COPIED_LOOPS,
|
|
- into TARGET loop, placing newly created loops into loop tree. */
|
|
+ into TARGET loop, placing newly created loops into loop tree adding
|
|
+ them to TARGETs sibling list at the end in order. */
|
|
static void
|
|
copy_loops_to (struct loop **copied_loops, int n, struct loop *target)
|
|
{
|
|
- struct loop *aloop;
|
|
+ struct loop *aloop, *tail;
|
|
int i;
|
|
|
|
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
|
|
+ ;
|
|
for (i = 0; i < n; i++)
|
|
{
|
|
- aloop = duplicate_loop (copied_loops[i], target);
|
|
+ aloop = duplicate_loop (copied_loops[i], target, tail);
|
|
+ tail = aloop;
|
|
+ gcc_assert (!tail->next);
|
|
duplicate_subloops (copied_loops[i], aloop);
|
|
}
|
|
}
|
|
@@ -1133,14 +1145,15 @@ set_zero_probability (edge e)
|
|
}
|
|
|
|
/* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating
|
|
- loop structure and dominators. E's destination must be LOOP header for
|
|
- this to work, i.e. it must be entry or latch edge of this loop; these are
|
|
- unique, as the loops must have preheaders for this function to work
|
|
- correctly (in case E is latch, the function unrolls the loop, if E is entry
|
|
- edge, it peels the loop). Store edges created by copying ORIG edge from
|
|
- copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to
|
|
- original LOOP body, the other copies are numbered in order given by control
|
|
- flow through them) into TO_REMOVE array. Returns false if duplication is
|
|
+ loop structure and dominators (order of inner subloops is retained).
|
|
+ E's destination must be LOOP header for this to work, i.e. it must be entry
|
|
+ or latch edge of this loop; these are unique, as the loops must have
|
|
+ preheaders for this function to work correctly (in case E is latch, the
|
|
+ function unrolls the loop, if E is entry edge, it peels the loop). Store
|
|
+ edges created by copying ORIG edge from copies corresponding to set bits in
|
|
+ WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies
|
|
+ are numbered in order given by control flow through them) into TO_REMOVE
|
|
+ array. Returns false if duplication is
|
|
impossible. */
|
|
|
|
bool
|
|
diff -N -urp a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h
|
|
--- a/gcc/cfgloopmanip.h 2018-11-07 11:37:24.939223860 +0800
|
|
+++ b/gcc/cfgloopmanip.h 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge,
|
|
unsigned, unsigned);
|
|
extern void unloop (struct loop *, bool *, bitmap);
|
|
extern void copy_loop_info (struct loop *loop, struct loop *target);
|
|
-extern struct loop * duplicate_loop (struct loop *, struct loop *);
|
|
+extern struct loop * duplicate_loop (struct loop *, struct loop *,
|
|
+ struct loop * = NULL);
|
|
extern void duplicate_subloops (struct loop *, struct loop *);
|
|
extern bool can_duplicate_loop_p (const struct loop *loop);
|
|
extern bool duplicate_loop_to_header_edge (struct loop *, edge,
|
|
diff -N -urp a/gcc/common.opt b/gcc/common.opt
|
|
--- a/gcc/common.opt 2018-11-07 11:37:24.859223860 +0800
|
|
+++ b/gcc/common.opt 2018-11-07 11:38:26.159223860 +0800
|
|
@@ -1496,8 +1496,8 @@ Common Alias(floop-nest-optimize)
|
|
Enable loop nest transforms. Same as -floop-nest-optimize.
|
|
|
|
floop-unroll-and-jam
|
|
-Common Alias(floop-nest-optimize)
|
|
-Enable loop nest transforms. Same as -floop-nest-optimize.
|
|
+Common Report Var(flag_unroll_jam) Optimization
|
|
+Perform unroll-and-jam on loops.
|
|
|
|
fgnu-tm
|
|
Common Report Var(flag_tm)
|
|
diff -N -urp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
|
|
--- a/gcc/doc/invoke.texi 2018-11-07 11:37:24.915223860 +0800
|
|
+++ b/gcc/doc/invoke.texi 2018-11-07 11:39:49.031223860 +0800
|
|
@@ -7120,7 +7120,8 @@ Optimize yet more. @option{-O3} turns o
|
|
by @option{-O2} and also turns on the @option{-finline-functions},
|
|
@option{-funswitch-loops}, @option{-fpredictive-commoning},
|
|
@option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
|
|
-@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}
|
|
+@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths},
|
|
+@option{-floop-unroll-and-jam},
|
|
@option{-ftree-slp-vectorize}, @option{-fvect-cost-model},
|
|
@option{-ftree-partial-pre}, @option{-fpeel-loops}
|
|
and @option{-fipa-cp-clone} options.
|
|
@@ -8226,12 +8227,10 @@ at @option{-O} and higher.
|
|
@itemx -floop-interchange
|
|
@itemx -floop-strip-mine
|
|
@itemx -floop-block
|
|
-@itemx -floop-unroll-and-jam
|
|
@opindex ftree-loop-linear
|
|
@opindex floop-interchange
|
|
@opindex floop-strip-mine
|
|
@opindex floop-block
|
|
-@opindex floop-unroll-and-jam
|
|
Perform loop nest optimizations. Same as
|
|
@option{-floop-nest-optimize}. To use this code transformation, GCC has
|
|
to be configured with @option{--with-isl} to enable the Graphite loop
|
|
@@ -8323,6 +8322,12 @@ ENDDO
|
|
@end smallexample
|
|
and the initialization loop is transformed into a call to memset zero.
|
|
|
|
+@item -floop-unroll-and-jam
|
|
+@opindex floop-unroll-and-jam
|
|
+Apply unroll and jam transformations on feasible loops. In a loop
|
|
+nest this unrolls the outer loop by some factor and fuses the resulting
|
|
+multiple inner loops. This flag is enabled by default at @option{-O3}.
|
|
+
|
|
@item -ftree-loop-im
|
|
@opindex ftree-loop-im
|
|
Perform loop invariant motion on trees. This pass moves only invariants that
|
|
@@ -10353,13 +10358,13 @@ loop in the loop nest by a given number
|
|
length can be changed using the @option{loop-block-tile-size}
|
|
parameter. The default value is 51 iterations.
|
|
|
|
-@item loop-unroll-jam-size
|
|
-Specify the unroll factor for the @option{-floop-unroll-and-jam} option. The
|
|
-default value is 4.
|
|
-
|
|
-@item loop-unroll-jam-depth
|
|
-Specify the dimension to be unrolled (counting from the most inner loop)
|
|
-for the @option{-floop-unroll-and-jam}. The default value is 2.
|
|
+@item unroll-jam-min-percent
|
|
+The minimum percentage of memory references that must be optimized
|
|
+away for the unroll-and-jam transformation to be considered profitable.
|
|
+
|
|
+@item unroll-jam-max-unroll
|
|
+The maximum number of times the outer loop should be unrolled by
|
|
+the unroll-and-jam transformation.
|
|
|
|
@item ipa-cp-value-list-size
|
|
IPA-CP attempts to track all possible values and types passed to a function's
|
|
diff -N -urp a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
|
|
--- a/gcc/gimple-loop-jam.c 1970-01-01 08:00:00.000000000 +0800
|
|
+++ b/gcc/gimple-loop-jam.c 2018-11-07 11:38:26.167223860 +0800
|
|
@@ -0,0 +1,598 @@
|
|
+/* Loop unroll-and-jam.
|
|
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
|
|
+
|
|
+This file is part of GCC.
|
|
+
|
|
+GCC is free software; you can redistribute it and/or modify it
|
|
+under the terms of the GNU General Public License as published by the
|
|
+Free Software Foundation; either version 3, or (at your option) any
|
|
+later version.
|
|
+
|
|
+GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
+for more details.
|
|
+
|
|
+You should have received a copy of the GNU General Public License
|
|
+along with GCC; see the file COPYING3. If not see
|
|
+<http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include "config.h"
|
|
+#include "system.h"
|
|
+#include "coretypes.h"
|
|
+#include "params.h"
|
|
+#include "tree-pass.h"
|
|
+#include "backend.h"
|
|
+#include "tree.h"
|
|
+#include "gimple.h"
|
|
+#include "ssa.h"
|
|
+#include "fold-const.h"
|
|
+#include "tree-cfg.h"
|
|
+#include "tree-ssa.h"
|
|
+#include "tree-ssa-loop-niter.h"
|
|
+#include "tree-ssa-loop.h"
|
|
+#include "tree-ssa-loop-manip.h"
|
|
+#include "cfgloop.h"
|
|
+#include "tree-scalar-evolution.h"
|
|
+#include "gimple-iterator.h"
|
|
+#include "cfghooks.h"
|
|
+#include "tree-data-ref.h"
|
|
+#include "tree-ssa-loop-ivopts.h"
|
|
+#include "tree-vectorizer.h"
|
|
+
|
|
+/* Unroll and Jam transformation
|
|
+
|
|
+ This is a combination of two transformations, where the second
|
|
+ is not always valid. It's applicable if a loop nest has redundancies
|
|
+ over the iterations of an outer loop while not having that with
|
|
+ an inner loop.
|
|
+
|
|
+ Given this nest:
|
|
+ for (i) {
|
|
+ for (j) {
|
|
+ B (i,j)
|
|
+ }
|
|
+ }
|
|
+
|
|
+ first unroll:
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ B (i,j)
|
|
+ }
|
|
+ for (j) {
|
|
+ B (i+1,j)
|
|
+ }
|
|
+ }
|
|
+
|
|
+ then fuse the two adjacent inner loops resulting from that:
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ B (i,j)
|
|
+ B (i+1,j)
|
|
+ }
|
|
+ }
|
|
+
|
|
+ As the order of evaluations of the body B changes this is valid
|
|
+ only in certain situations: all distance vectors need to be forward.
|
|
+ Additionally if there are multiple induction variables than just
|
|
+ a counting control IV (j above) we can also deal with some situations.
|
|
+
|
|
+ The validity is checked by unroll_jam_possible_p, and the data-dep
|
|
+ testing below.
|
|
+
|
|
+ A trivial example where the fusion is wrong would be when
|
|
+ B (i,j) == x[j-1] = x[j];
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ x[j-1] = x[j];
|
|
+ }
|
|
+ for (j) {
|
|
+ x[j-1] = x[j];
|
|
+ }
|
|
+ } effect: move content to front by two elements
|
|
+ -->
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ x[j-1] = x[j];
|
|
+ x[j-1] = x[j];
|
|
+ }
|
|
+ } effect: move content to front by one element
|
|
+*/
|
|
+
|
|
+/* Modify the loop tree for the fact that all code once belonging
|
|
+ to the OLD loop or the outer loop of OLD now is inside LOOP. */
|
|
+
|
|
+static void
|
|
+merge_loop_tree (struct loop *loop, struct loop *old)
|
|
+{
|
|
+ basic_block *bbs;
|
|
+ int i, n;
|
|
+ struct loop *subloop;
|
|
+ edge e;
|
|
+ edge_iterator ei;
|
|
+
|
|
+ /* Find its nodes. */
|
|
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
|
|
+ n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
|
|
+
|
|
+ for (i = 0; i < n; i++)
|
|
+ {
|
|
+ /* If the block was direct child of OLD loop it's now part
|
|
+ of LOOP. If it was outside OLD, then it moved into LOOP
|
|
+ as well. This avoids changing the loop father for BBs
|
|
+ in inner loops of OLD. */
|
|
+ if (bbs[i]->loop_father == old
|
|
+ || loop_depth (bbs[i]->loop_father) < loop_depth (old))
|
|
+ {
|
|
+ remove_bb_from_loops (bbs[i]);
|
|
+ add_bb_to_loop (bbs[i], loop);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* If we find a direct subloop of OLD, move it to LOOP. */
|
|
+ subloop = bbs[i]->loop_father;
|
|
+ if (loop_outer (subloop) == old && subloop->header == bbs[i])
|
|
+ {
|
|
+ flow_loop_tree_node_remove (subloop);
|
|
+ flow_loop_tree_node_add (loop, subloop);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Update the information about loop exit edges. */
|
|
+ for (i = 0; i < n; i++)
|
|
+ {
|
|
+ FOR_EACH_EDGE (e, ei, bbs[i]->succs)
|
|
+ {
|
|
+ rescan_loop_exit (e, false, false);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ loop->num_nodes = n;
|
|
+
|
|
+ free (bbs);
|
|
+}
|
|
+
|
|
+/* BB is part of the outer loop of an unroll-and-jam situation.
|
|
+ Check if any statements therein would prevent the transformation. */
|
|
+
|
|
+static bool
|
|
+bb_prevents_fusion_p (basic_block bb)
|
|
+{
|
|
+ gimple_stmt_iterator gsi;
|
|
+ /* BB is duplicated by outer unrolling and then all N-1 first copies
|
|
+ move into the body of the fused inner loop. If BB exits the outer loop
|
|
+ the last copy still does so, and the first N-1 copies are cancelled
|
|
+ by loop unrolling, so also after fusion it's the exit block.
|
|
+ But there might be other reasons that prevent fusion:
|
|
+ * stores or unknown side-effects prevent fusion
|
|
+ * loads don't
|
|
+ * computations into SSA names: these aren't problematic. Their
|
|
+ result will be unused on the exit edges of the first N-1 copies
|
|
+ (those aren't taken after unrolling). If they are used on the
|
|
+ other edge (the one leading to the outer latch block) they are
|
|
+ loop-carried (on the outer loop) and the Nth copy of BB will
|
|
+ compute them again (i.e. the first N-1 copies will be dead). */
|
|
+ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
|
|
+ {
|
|
+ gimple *g = gsi_stmt (gsi);
|
|
+ if (gimple_vdef (g) || gimple_has_side_effects (g))
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/* Given an inner loop LOOP (of some OUTER loop) determine if
|
|
+ we can safely fuse copies of it (generated by outer unrolling).
|
|
+ If so return true, otherwise return false. */
|
|
+
|
|
+static bool
|
|
+unroll_jam_possible_p (struct loop *outer, struct loop *loop)
|
|
+{
|
|
+ basic_block *bbs;
|
|
+ int i, n;
|
|
+ struct tree_niter_desc niter;
|
|
+
|
|
+ /* When fusing the loops we skip the latch block
|
|
+ of the first one, so it mustn't have any effects to
|
|
+ preserve. */
|
|
+ if (!empty_block_p (loop->latch))
|
|
+ return false;
|
|
+
|
|
+ if (!single_exit (loop))
|
|
+ return false;
|
|
+
|
|
+ /* We need a perfect nest. Quick check for adjacent inner loops. */
|
|
+ if (outer->inner != loop || loop->next)
|
|
+ return false;
|
|
+
|
|
+ /* Prevent head-controlled inner loops, that we usually have.
|
|
+ The guard block would need to be accepted
|
|
+ (invariant condition either entering or skipping the loop),
|
|
+ without also accepting arbitrary control flow. When unswitching
|
|
+ ran before us (as with -O3) this won't be a problem because its
|
|
+ outer loop unswitching will have moved out the invariant condition.
|
|
+
|
|
+ If we do that we need to extend fuse_loops () to cope with this
|
|
+ by threading through the (still invariant) copied condition
|
|
+ between the two loop copies. */
|
|
+ if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
|
|
+ return false;
|
|
+
|
|
+ /* The number of iterations of the inner loop must be loop invariant
|
|
+ with respect to the outer loop. */
|
|
+ if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
|
|
+ false, true)
|
|
+ || niter.cmp == ERROR_MARK
|
|
+ || !integer_zerop (niter.may_be_zero)
|
|
+ || !expr_invariant_in_loop_p (outer, niter.niter))
|
|
+ return false;
|
|
+
|
|
+ /* If the inner loop produces any values that are used inside the
|
|
+ outer loop (except the virtual op) then it can flow
|
|
+ back (perhaps indirectly) into the inner loop. This prevents
|
|
+ fusion: without fusion the value at the last iteration is used,
|
|
+ with fusion the value after the initial iteration is used.
|
|
+
|
|
+ If all uses are outside the outer loop this doesn't prevent fusion;
|
|
+ the value of the last iteration is still used (and the values from
|
|
+ all intermediate iterations are dead). */
|
|
+ gphi_iterator psi;
|
|
+ for (psi = gsi_start_phis (single_exit (loop)->dest);
|
|
+ !gsi_end_p (psi); gsi_next (&psi))
|
|
+ {
|
|
+ imm_use_iterator imm_iter;
|
|
+ use_operand_p use_p;
|
|
+ tree op = gimple_phi_result (psi.phi ());
|
|
+ if (virtual_operand_p (op))
|
|
+ continue;
|
|
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
|
|
+ {
|
|
+ gimple *use_stmt = USE_STMT (use_p);
|
|
+ if (!is_gimple_debug (use_stmt)
|
|
+ && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* And check blocks belonging to just outer loop. */
|
|
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
|
|
+ n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
|
|
+
|
|
+ for (i = 0; i < n; i++)
|
|
+ if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
|
|
+ break;
|
|
+ free (bbs);
|
|
+ if (i != n)
|
|
+ return false;
|
|
+
|
|
+ /* For now we can safely fuse copies of LOOP only if all
|
|
+ loop carried variables are inductions (or the virtual op).
|
|
+
|
|
+ We could handle reductions as well (the initial value in the second
|
|
+ body would be the after-iter value of the first body) if it's over
|
|
+ an associative and commutative operation. We wouldn't
|
|
+ be able to handle unknown cycles. */
|
|
+ for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
|
|
+ {
|
|
+ affine_iv iv;
|
|
+ tree op = gimple_phi_result (psi.phi ());
|
|
+
|
|
+ if (virtual_operand_p (op))
|
|
+ continue;
|
|
+ if (!simple_iv (loop, loop, op, &iv, true))
|
|
+ return false;
|
|
+ /* The inductions must be regular, loop invariant step and initial
|
|
+ value. */
|
|
+ if (!expr_invariant_in_loop_p (outer, iv.step)
|
|
+ || !expr_invariant_in_loop_p (outer, iv.base))
|
|
+ return false;
|
|
+ /* XXX With more effort we could also be able to deal with inductions
|
|
+ where the initial value is loop variant but a simple IV in the
|
|
+ outer loop. The initial value for the second body would be
|
|
+ the original initial value plus iv.base.step. The next value
|
|
+ for the fused loop would be the original next value of the first
|
|
+ copy, _not_ the next value of the second body. */
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Fuse LOOP with all further neighbors. The loops are expected to
|
|
+ be in appropriate form. */
|
|
+
|
|
+static void
|
|
+fuse_loops (struct loop *loop)
|
|
+{
|
|
+ struct loop *next = loop->next;
|
|
+
|
|
+ while (next)
|
|
+ {
|
|
+ edge e;
|
|
+
|
|
+ remove_branch (single_pred_edge (loop->latch));
|
|
+ /* Make delete_basic_block not fiddle with the loop structure. */
|
|
+ basic_block oldlatch = loop->latch;
|
|
+ loop->latch = NULL;
|
|
+ delete_basic_block (oldlatch);
|
|
+ e = redirect_edge_and_branch (loop_latch_edge (next),
|
|
+ loop->header);
|
|
+ loop->latch = e->src;
|
|
+ flush_pending_stmts (e);
|
|
+
|
|
+ gcc_assert (EDGE_COUNT (next->header->preds) == 1);
|
|
+
|
|
+ /* The PHI nodes of the second body (single-argument now)
|
|
+ need adjustments to use the right values: either directly
|
|
+ the value of the corresponding PHI in the first copy or
|
|
+ the one leaving the first body which unrolling did for us.
|
|
+
|
|
+ See also unroll_jam_possible_p () for further possibilities. */
|
|
+ gphi_iterator psi_first, psi_second;
|
|
+ e = single_pred_edge (next->header);
|
|
+ for (psi_first = gsi_start_phis (loop->header),
|
|
+ psi_second = gsi_start_phis (next->header);
|
|
+ !gsi_end_p (psi_first);
|
|
+ gsi_next (&psi_first), gsi_next (&psi_second))
|
|
+ {
|
|
+ gphi *phi_first = psi_first.phi ();
|
|
+ gphi *phi_second = psi_second.phi ();
|
|
+ tree firstop = gimple_phi_result (phi_first);
|
|
+ /* The virtual operand is correct already as it's
|
|
+ always live at exit, hence has a LCSSA node and outer
|
|
+ loop unrolling updated SSA form. */
|
|
+ if (virtual_operand_p (firstop))
|
|
+ continue;
|
|
+
|
|
+ /* Due to unroll_jam_possible_p () we know that this is
|
|
+ an induction. The second body goes over the same
|
|
+ iteration space. */
|
|
+ add_phi_arg (phi_second, firstop, e,
|
|
+ gimple_location (phi_first));
|
|
+ }
|
|
+ gcc_assert (gsi_end_p (psi_second));
|
|
+
|
|
+ merge_loop_tree (loop, next);
|
|
+ gcc_assert (!next->num_nodes);
|
|
+ struct loop *ln = next->next;
|
|
+ delete_loop (next);
|
|
+ next = ln;
|
|
+ }
|
|
+ rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
|
|
+}
|
|
+
|
|
+/* Returns true if the distance in DDR can be determined and adjusts
|
|
+ the unroll factor in *UNROLL to make unrolling valid for that distance.
|
|
+ Otherwise return false.
|
|
+
|
|
+ If this data dep can lead to a removed memory reference, increment
|
|
+ *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
|
|
+ for this to happen. */
|
|
+
|
|
+static bool
|
|
+adjust_unroll_factor (struct data_dependence_relation *ddr,
|
|
+ unsigned *unroll, unsigned *profit_unroll,
|
|
+ unsigned *removed)
|
|
+{
|
|
+ bool ret = false;
|
|
+ if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
|
|
+ {
|
|
+ if (DDR_NUM_DIST_VECTS (ddr) == 0)
|
|
+ return false;
|
|
+ unsigned i;
|
|
+ lambda_vector dist_v;
|
|
+ FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
|
|
+ {
|
|
+ /* A distance (a,b) is at worst transformed into (a/N,b) by the
|
|
+ unrolling (factor N), so the transformation is valid if
|
|
+ a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
|
|
+ factor needs to be limited so that the first condition holds.
|
|
+ That may limit the factor down to zero in the worst case. */
|
|
+ int dist = dist_v[0];
|
|
+ if (dist < 0)
|
|
+ gcc_unreachable ();
|
|
+ else if ((unsigned)dist >= *unroll)
|
|
+ ;
|
|
+ else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
|
|
+ || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
|
|
+ && dist > 0))
|
|
+ ;
|
|
+ else
|
|
+ *unroll = dist;
|
|
+
|
|
+ /* With a distance (a,0) it's always profitable to unroll-and-jam
|
|
+ (by a+1), because one memory reference will go away. With
|
|
+ (a,b) and b != 0 that's less clear. We will increase the
|
|
+ number of streams without lowering the number of mem refs.
|
|
+ So for now only handle the first situation. */
|
|
+ if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
|
|
+ {
|
|
+ *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
|
|
+ (*removed)++;
|
|
+ }
|
|
+
|
|
+ ret = true;
|
|
+ }
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Main entry point for the unroll-and-jam transformation
|
|
+ described above. */
|
|
+
|
|
+static unsigned int
|
|
+tree_loop_unroll_and_jam (void)
|
|
+{
|
|
+ struct loop *loop;
|
|
+ bool changed = false;
|
|
+
|
|
+ gcc_assert (scev_initialized_p ());
|
|
+
|
|
+ /* Go through all innermost loops. */
|
|
+ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
|
|
+ {
|
|
+ struct loop *outer = loop_outer (loop);
|
|
+
|
|
+ if (loop_depth (loop) < 2
|
|
+ || optimize_loop_nest_for_size_p (outer))
|
|
+ continue;
|
|
+
|
|
+ if (!unroll_jam_possible_p (outer, loop))
|
|
+ continue;
|
|
+
|
|
+ vec<data_reference_p> datarefs;
|
|
+ vec<ddr_p> dependences;
|
|
+ unsigned unroll_factor, profit_unroll, removed;
|
|
+ struct tree_niter_desc desc;
|
|
+ bool unroll = false;
|
|
+
|
|
+ auto_vec<loop_p, 3> loop_nest;
|
|
+ dependences.create (10);
|
|
+ datarefs.create (10);
|
|
+ if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
|
|
+ &datarefs, &dependences))
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Cannot analyze data dependencies\n");
|
|
+ free_data_refs (datarefs);
|
|
+ free_dependence_relations (dependences);
|
|
+ return false;
|
|
+ }
|
|
+ if (!datarefs.length ())
|
|
+ continue;
|
|
+
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ dump_data_dependence_relations (dump_file, dependences);
|
|
+
|
|
+ unroll_factor = (unsigned)-1;
|
|
+ profit_unroll = 1;
|
|
+ removed = 0;
|
|
+
|
|
+ /* Check all dependencies. */
|
|
+ unsigned i;
|
|
+ struct data_dependence_relation *ddr;
|
|
+ FOR_EACH_VEC_ELT (dependences, i, ddr)
|
|
+ {
|
|
+ struct data_reference *dra, *drb;
|
|
+
|
|
+	  /* If the refs are independent there's nothing to do. */
|
|
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
|
|
+ continue;
|
|
+ dra = DDR_A (ddr);
|
|
+ drb = DDR_B (ddr);
|
|
+ /* Nothing interesting for the self dependencies. */
|
|
+ if (dra == drb)
|
|
+ continue;
|
|
+
|
|
+ /* Now check the distance vector, for determining a sensible
|
|
+ outer unroll factor, and for validity of merging the inner
|
|
+ loop copies. */
|
|
+ if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
|
|
+ &removed))
|
|
+ {
|
|
+ /* Couldn't get the distance vector. For two reads that's
|
|
+ harmless (we assume we should unroll). For at least
|
|
+ one write this means we can't check the dependence direction
|
|
+ and hence can't determine safety. */
|
|
+
|
|
+ if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
|
|
+ {
|
|
+ unroll_factor = 0;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* We regard a user-specified minimum percentage of zero as a request
|
|
+ to ignore all profitability concerns and apply the transformation
|
|
+ always. */
|
|
+ if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
|
|
+ profit_unroll = 2;
|
|
+ else if (removed * 100 / datarefs.length ()
|
|
+ < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
|
|
+ profit_unroll = 1;
|
|
+ if (unroll_factor > profit_unroll)
|
|
+ unroll_factor = profit_unroll;
|
|
+ if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
|
|
+ unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
|
|
+ unroll = (unroll_factor > 1
|
|
+ && can_unroll_loop_p (outer, unroll_factor, &desc));
|
|
+
|
|
+ if (unroll)
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
|
|
+ find_loop_location (outer),
|
|
+ "applying unroll and jam with factor %d\n",
|
|
+ unroll_factor);
|
|
+ initialize_original_copy_tables ();
|
|
+ tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
|
|
+ &desc);
|
|
+ free_original_copy_tables ();
|
|
+ fuse_loops (outer->inner);
|
|
+ changed = true;
|
|
+ }
|
|
+
|
|
+ loop_nest.release ();
|
|
+ free_dependence_relations (dependences);
|
|
+ free_data_refs (datarefs);
|
|
+ }
|
|
+
|
|
+ if (changed)
|
|
+ {
|
|
+ scev_reset ();
|
|
+ free_dominance_info (CDI_DOMINATORS);
|
|
+ return TODO_cleanup_cfg;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Pass boilerplate. */
|
|
+
|
|
+namespace {
|
|
+
|
|
+const pass_data pass_data_loop_jam =
|
|
+{
|
|
+ GIMPLE_PASS, /* type. */
|
|
+ "unrolljam", /* name. */
|
|
+ OPTGROUP_LOOP, /* optinfo_flags. */
|
|
+ TV_LOOP_JAM, /* tv_id. */
|
|
+ PROP_cfg, /* properties_required. */
|
|
+ 0, /* properties_provided. */
|
|
+ 0, /* properties_destroyed. */
|
|
+ 0, /* todo_flags_start. */
|
|
+ 0, /* todo_flags_finish. */
|
|
+};
|
|
+
|
|
+class pass_loop_jam : public gimple_opt_pass
|
|
+{
|
|
+public:
|
|
+ pass_loop_jam (gcc::context *ctxt)
|
|
+ : gimple_opt_pass (pass_data_loop_jam, ctxt)
|
|
+ {}
|
|
+
|
|
+ /* opt_pass methods: */
|
|
+ virtual bool gate (function *)
|
|
+ {
|
|
+ return flag_unroll_jam != 0;
|
|
+ }
|
|
+ virtual unsigned int execute (function *);
|
|
+
|
|
+};
|
|
+
|
|
+unsigned int
|
|
+pass_loop_jam::execute (function *fun)
|
|
+{
|
|
+ if (number_of_loops (fun) <= 1)
|
|
+ return 0;
|
|
+
|
|
+ return tree_loop_unroll_and_jam ();
|
|
+}
|
|
+
|
|
+}
|
|
+
|
|
+gimple_opt_pass *
|
|
+make_pass_loop_jam (gcc::context *ctxt)
|
|
+{
|
|
+ return new pass_loop_jam (ctxt);
|
|
+}
|
|
+
|
|
diff -N -urp a/gcc/opts.c b/gcc/opts.c
|
|
--- a/gcc/opts.c 2018-11-07 11:37:24.891223860 +0800
|
|
+++ b/gcc/opts.c 2018-11-07 11:38:26.171223860 +0800
|
|
@@ -534,6 +534,7 @@ static const struct default_options defa
|
|
{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 },
|
|
+ { OPT_LEVELS_3_PLUS, OPT_floop_unroll_and_jam, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 },
|
|
diff -N -urp a/gcc/params.def b/gcc/params.def
|
|
--- a/gcc/params.def 2018-11-07 11:37:27.543223860 +0800
|
|
+++ b/gcc/params.def 2018-11-07 11:38:26.171223860 +0800
|
|
@@ -1280,6 +1280,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK,
|
|
"Enable loop epilogue vectorization using smaller vector size.",
|
|
0, 0, 1)
|
|
|
|
+DEFPARAM (PARAM_UNROLL_JAM_MIN_PERCENT,
|
|
+ "unroll-jam-min-percent",
|
|
+ "Minimum percentage of memrefs that must go away for unroll-and-jam to be considered profitable.",
|
|
+ 1, 0, 100)
|
|
+
|
|
+DEFPARAM (PARAM_UNROLL_JAM_MAX_UNROLL,
|
|
+ "unroll-jam-max-unroll",
|
|
+ "Maximum unroll factor for the unroll-and-jam transformation.",
|
|
+ 4, 0, 0)
|
|
+
|
|
/*
|
|
|
|
Local variables:
|
|
diff -N -urp a/gcc/passes.def b/gcc/passes.def
|
|
--- a/gcc/passes.def 2018-11-07 11:37:24.859223860 +0800
|
|
+++ b/gcc/passes.def 2018-11-07 11:38:26.171223860 +0800
|
|
@@ -272,6 +272,7 @@ along with GCC; see the file COPYING3.
|
|
NEXT_PASS (pass_tree_unswitch);
|
|
NEXT_PASS (pass_scev_cprop);
|
|
NEXT_PASS (pass_loop_split);
|
|
+ NEXT_PASS (pass_loop_jam);
|
|
/* All unswitching, final value replacement and splitting can expose
|
|
empty loops. Remove them now. */
|
|
NEXT_PASS (pass_cd_dce);
|
|
diff -N -urp a/gcc/timevar.def b/gcc/timevar.def
|
|
--- a/gcc/timevar.def 2018-11-07 11:37:24.935223860 +0800
|
|
+++ b/gcc/timevar.def 2018-11-07 11:38:26.175223860 +0800
|
|
@@ -186,6 +186,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "
|
|
DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop")
|
|
DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
|
|
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
|
|
+DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
|
|
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
|
|
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
|
|
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
|
|
diff -N -urp a/gcc/tree-pass.h b/gcc/tree-pass.h
|
|
--- a/gcc/tree-pass.h 2018-11-07 11:37:24.887223860 +0800
|
|
+++ b/gcc/tree-pass.h 2018-11-07 11:38:26.175223860 +0800
|
|
@@ -369,6 +369,7 @@ extern gimple_opt_pass *make_pass_tree_l
|
|
extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt);
|
|
+extern gimple_opt_pass *make_pass_loop_jam (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt);
|