diff -N -urp a/gcc/Makefile.in b/gcc/Makefile.in --- a/gcc/Makefile.in 2018-11-07 11:37:24.615223860 +0800 +++ b/gcc/Makefile.in 2018-11-07 11:38:26.155223860 +0800 @@ -1292,6 +1292,7 @@ OBJS = \ gimple-iterator.o \ gimple-fold.o \ gimple-laddress.o \ + gimple-loop-jam.o \ gimple-low.o \ gimple-pretty-print.o \ gimple-ssa-backprop.o \ diff -N -urp a/gcc/cfgloop.c b/gcc/cfgloop.c --- a/gcc/cfgloop.c 2018-11-07 11:37:24.947223860 +0800 +++ b/gcc/cfgloop.c 2018-11-07 11:38:26.155223860 +0800 @@ -296,13 +296,25 @@ establish_preds (struct loop *loop, stru /* Add LOOP to the loop hierarchy tree where FATHER is father of the added loop. If LOOP has some children, take care of that their - pred field will be initialized correctly. */ + pred field will be initialized correctly. If AFTER is non-null + then it's expected it's a pointer into FATHERs inner sibling + list and LOOP is added behind AFTER, otherwise it's added in front + of FATHERs siblings. */ void -flow_loop_tree_node_add (struct loop *father, struct loop *loop) +flow_loop_tree_node_add (struct loop *father, struct loop *loop, + struct loop *after) { - loop->next = father->inner; - father->inner = loop; + if (after) + { + loop->next = after->next; + after->next = loop; + } + else + { + loop->next = father->inner; + father->inner = loop; + } establish_preds (loop, father); } diff -N -urp a/gcc/cfgloop.h b/gcc/cfgloop.h --- a/gcc/cfgloop.h 2018-11-07 11:37:24.331223860 +0800 +++ b/gcc/cfgloop.h 2018-11-07 11:38:26.155223860 +0800 @@ -324,7 +324,8 @@ void record_loop_exits (void); void rescan_loop_exit (edge, bool, bool); /* Loop data structure manipulation/querying. 
*/ -extern void flow_loop_tree_node_add (struct loop *, struct loop *); +extern void flow_loop_tree_node_add (struct loop *, struct loop *, + struct loop * = NULL); extern void flow_loop_tree_node_remove (struct loop *); extern bool flow_loop_nested_p (const struct loop *, const struct loop *); extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block); diff -N -urp a/gcc/cfgloopmanip.c b/gcc/cfgloopmanip.c --- a/gcc/cfgloopmanip.c 2018-11-07 11:37:24.847223860 +0800 +++ b/gcc/cfgloopmanip.c 2018-11-07 11:38:26.155223860 +0800 @@ -1026,9 +1026,11 @@ copy_loop_info (struct loop *loop, struc } /* Copies copy of LOOP as subloop of TARGET loop, placing newly - created loop into loops structure. */ + created loop into loops structure. If AFTER is non-null + the new loop is added at AFTER->next, otherwise in front of TARGETs + sibling list. */ struct loop * -duplicate_loop (struct loop *loop, struct loop *target) +duplicate_loop (struct loop *loop, struct loop *target, struct loop *after) { struct loop *cloop; cloop = alloc_loop (); @@ -1040,36 +1042,46 @@ duplicate_loop (struct loop *loop, struc set_loop_copy (loop, cloop); /* Add it to target. */ - flow_loop_tree_node_add (target, cloop); + flow_loop_tree_node_add (target, cloop, after); return cloop; } /* Copies structure of subloops of LOOP into TARGET loop, placing - newly created loops into loop tree. */ + newly created loops into loop tree at the end of TARGETs sibling + list in the original order. 
*/ void duplicate_subloops (struct loop *loop, struct loop *target) { - struct loop *aloop, *cloop; + struct loop *aloop, *cloop, *tail; + for (tail = target->inner; tail && tail->next; tail = tail->next) + ; for (aloop = loop->inner; aloop; aloop = aloop->next) { - cloop = duplicate_loop (aloop, target); + cloop = duplicate_loop (aloop, target, tail); + tail = cloop; + gcc_assert (!tail->next); duplicate_subloops (aloop, cloop); } } /* Copies structure of subloops of N loops, stored in array COPIED_LOOPS, - into TARGET loop, placing newly created loops into loop tree. */ + into TARGET loop, placing newly created loops into loop tree adding + them to TARGETs sibling list at the end in order. */ static void copy_loops_to (struct loop **copied_loops, int n, struct loop *target) { - struct loop *aloop; + struct loop *aloop, *tail; int i; + for (tail = target->inner; tail && tail->next; tail = tail->next) + ; for (i = 0; i < n; i++) { - aloop = duplicate_loop (copied_loops[i], target); + aloop = duplicate_loop (copied_loops[i], target, tail); + tail = aloop; + gcc_assert (!tail->next); duplicate_subloops (copied_loops[i], aloop); } } @@ -1133,14 +1145,15 @@ set_zero_probability (edge e) } /* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating - loop structure and dominators. E's destination must be LOOP header for - this to work, i.e. it must be entry or latch edge of this loop; these are - unique, as the loops must have preheaders for this function to work - correctly (in case E is latch, the function unrolls the loop, if E is entry - edge, it peels the loop). Store edges created by copying ORIG edge from - copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to - original LOOP body, the other copies are numbered in order given by control - flow through them) into TO_REMOVE array. Returns false if duplication is + loop structure and dominators (order of inner subloops is retained). 
+ E's destination must be LOOP header for this to work, i.e. it must be entry + or latch edge of this loop; these are unique, as the loops must have + preheaders for this function to work correctly (in case E is latch, the + function unrolls the loop, if E is entry edge, it peels the loop). Store + edges created by copying ORIG edge from copies corresponding to set bits in + WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies + are numbered in order given by control flow through them) into TO_REMOVE + array. Returns false if duplication is impossible. */ bool diff -N -urp a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h --- a/gcc/cfgloopmanip.h 2018-11-07 11:37:24.939223860 +0800 +++ b/gcc/cfgloopmanip.h 2018-11-07 11:38:26.155223860 +0800 @@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge, unsigned, unsigned); extern void unloop (struct loop *, bool *, bitmap); extern void copy_loop_info (struct loop *loop, struct loop *target); -extern struct loop * duplicate_loop (struct loop *, struct loop *); +extern struct loop * duplicate_loop (struct loop *, struct loop *, + struct loop * = NULL); extern void duplicate_subloops (struct loop *, struct loop *); extern bool can_duplicate_loop_p (const struct loop *loop); extern bool duplicate_loop_to_header_edge (struct loop *, edge, diff -N -urp a/gcc/common.opt b/gcc/common.opt --- a/gcc/common.opt 2018-11-07 11:37:24.859223860 +0800 +++ b/gcc/common.opt 2018-11-07 11:38:26.159223860 +0800 @@ -1496,8 +1496,8 @@ Common Alias(floop-nest-optimize) Enable loop nest transforms. Same as -floop-nest-optimize. floop-unroll-and-jam -Common Alias(floop-nest-optimize) -Enable loop nest transforms. Same as -floop-nest-optimize. +Common Report Var(flag_unroll_jam) Optimization +Perform unroll-and-jam on loops. 
fgnu-tm Common Report Var(flag_tm) diff -N -urp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi --- a/gcc/doc/invoke.texi 2018-11-07 11:37:24.915223860 +0800 +++ b/gcc/doc/invoke.texi 2018-11-07 11:39:49.031223860 +0800 @@ -7120,7 +7120,8 @@ Optimize yet more. @option{-O3} turns o by @option{-O2} and also turns on the @option{-finline-functions}, @option{-funswitch-loops}, @option{-fpredictive-commoning}, @option{-fgcse-after-reload}, @option{-ftree-loop-vectorize}, -@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths} +@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}, +@option{-floop-unroll-and-jam}, @option{-ftree-slp-vectorize}, @option{-fvect-cost-model}, @option{-ftree-partial-pre}, @option{-fpeel-loops} and @option{-fipa-cp-clone} options. @@ -8226,12 +8227,10 @@ at @option{-O} and higher. @itemx -floop-interchange @itemx -floop-strip-mine @itemx -floop-block -@itemx -floop-unroll-and-jam @opindex ftree-loop-linear @opindex floop-interchange @opindex floop-strip-mine @opindex floop-block -@opindex floop-unroll-and-jam Perform loop nest optimizations. Same as @option{-floop-nest-optimize}. To use this code transformation, GCC has to be configured with @option{--with-isl} to enable the Graphite loop @@ -8323,6 +8322,12 @@ ENDDO @end smallexample and the initialization loop is transformed into a call to memset zero. +@item -floop-unroll-and-jam +@opindex floop-unroll-and-jam +Apply unroll and jam transformations on feasible loops. In a loop +nest this unrolls the outer loop by some factor and fuses the resulting +multiple inner loops. This flag is enabled by default at @option{-O3}. + @item -ftree-loop-im @opindex ftree-loop-im Perform loop invariant motion on trees. This pass moves only invariants that @@ -10353,13 +10358,13 @@ loop in the loop nest by a given number length can be changed using the @option{loop-block-tile-size} parameter. The default value is 51 iterations. 
-@item loop-unroll-jam-size -Specify the unroll factor for the @option{-floop-unroll-and-jam} option. The -default value is 4. - -@item loop-unroll-jam-depth -Specify the dimension to be unrolled (counting from the most inner loop) -for the @option{-floop-unroll-and-jam}. The default value is 2. +@item unroll-jam-min-percent +The minimum percentage of memory references that must be optimized +away for the unroll-and-jam transformation to be considered profitable. + +@item unroll-jam-max-unroll +The maximum number of times the outer loop should be unrolled by +the unroll-and-jam transformation. @item ipa-cp-value-list-size IPA-CP attempts to track all possible values and types passed to a function's diff -N -urp a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c --- a/gcc/gimple-loop-jam.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/gimple-loop-jam.c 2018-11-07 11:38:26.167223860 +0800 @@ -0,0 +1,598 @@ +/* Loop unroll-and-jam. + Copyright (C) 2017-2018 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +GCC is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. 
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "params.h" +#include "tree-pass.h" +#include "backend.h" +#include "tree.h" +#include "gimple.h" +#include "ssa.h" +#include "fold-const.h" +#include "tree-cfg.h" +#include "tree-ssa.h" +#include "tree-ssa-loop-niter.h" +#include "tree-ssa-loop.h" +#include "tree-ssa-loop-manip.h" +#include "cfgloop.h" +#include "tree-scalar-evolution.h" +#include "gimple-iterator.h" +#include "cfghooks.h" +#include "tree-data-ref.h" +#include "tree-ssa-loop-ivopts.h" +#include "tree-vectorizer.h" + +/* Unroll and Jam transformation + + This is a combination of two transformations, where the second + is not always valid. It's applicable if a loop nest has redundancies + over the iterations of an outer loop while not having that with + an inner loop. + + Given this nest: + for (i) { + for (j) { + B (i,j) + } + } + + first unroll: + for (i by 2) { + for (j) { + B (i,j) + } + for (j) { + B (i+1,j) + } + } + + then fuse the two adjacent inner loops resulting from that: + for (i by 2) { + for (j) { + B (i,j) + B (i+1,j) + } + } + + As the order of evaluations of the body B changes this is valid + only in certain situations: all distance vectors need to be forward. + Additionally, if there are more induction variables than just + a counting control IV (j above), we can also deal with some situations. + + The validity is checked by unroll_jam_possible_p, and the data-dep + testing below. + + A trivial example where the fusion is wrong would be when + B (i,j) == x[j-1] = x[j]; + for (i by 2) { + for (j) { + x[j-1] = x[j]; + } + for (j) { + x[j-1] = x[j]; + } + } effect: move content to front by two elements + --> + for (i by 2) { + for (j) { + x[j-1] = x[j]; + x[j-1] = x[j]; + } + } effect: move content to front by one element +*/ + +/* Modify the loop tree for the fact that all code once belonging + to the OLD loop or the outer loop of OLD now is inside LOOP. 
*/ + +static void +merge_loop_tree (struct loop *loop, struct loop *old) +{ + basic_block *bbs; + int i, n; + struct loop *subloop; + edge e; + edge_iterator ei; + + /* Find its nodes. */ + bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); + n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); + + for (i = 0; i < n; i++) + { + /* If the block was direct child of OLD loop it's now part + of LOOP. If it was outside OLD, then it moved into LOOP + as well. This avoids changing the loop father for BBs + in inner loops of OLD. */ + if (bbs[i]->loop_father == old + || loop_depth (bbs[i]->loop_father) < loop_depth (old)) + { + remove_bb_from_loops (bbs[i]); + add_bb_to_loop (bbs[i], loop); + continue; + } + + /* If we find a direct subloop of OLD, move it to LOOP. */ + subloop = bbs[i]->loop_father; + if (loop_outer (subloop) == old && subloop->header == bbs[i]) + { + flow_loop_tree_node_remove (subloop); + flow_loop_tree_node_add (loop, subloop); + } + } + + /* Update the information about loop exit edges. */ + for (i = 0; i < n; i++) + { + FOR_EACH_EDGE (e, ei, bbs[i]->succs) + { + rescan_loop_exit (e, false, false); + } + } + + loop->num_nodes = n; + + free (bbs); +} + +/* BB is part of the outer loop of an unroll-and-jam situation. + Check if any statements therein would prevent the transformation. */ + +static bool +bb_prevents_fusion_p (basic_block bb) +{ + gimple_stmt_iterator gsi; + /* BB is duplicated by outer unrolling and then all N-1 first copies + move into the body of the fused inner loop. If BB exits the outer loop + the last copy still does so, and the first N-1 copies are cancelled + by loop unrolling, so also after fusion it's the exit block. + But there might be other reasons that prevent fusion: + * stores or unknown side-effects prevent fusion + * loads don't + * computations into SSA names: these aren't problematic. 
Their + result will be unused on the exit edges of the first N-1 copies + (those aren't taken after unrolling). If they are used on the + other edge (the one leading to the outer latch block) they are + loop-carried (on the outer loop) and the Nth copy of BB will + compute them again (i.e. the first N-1 copies will be dead). */ + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *g = gsi_stmt (gsi); + if (gimple_vdef (g) || gimple_has_side_effects (g)) + return true; + } + return false; +} + +/* Given an inner loop LOOP (of some OUTER loop) determine if + we can safely fuse copies of it (generated by outer unrolling). + If so return true, otherwise return false. */ + +static bool +unroll_jam_possible_p (struct loop *outer, struct loop *loop) +{ + basic_block *bbs; + int i, n; + struct tree_niter_desc niter; + + /* When fusing the loops we skip the latch block + of the first one, so it mustn't have any effects to + preserve. */ + if (!empty_block_p (loop->latch)) + return false; + + if (!single_exit (loop)) + return false; + + /* We need a perfect nest. Quick check for adjacent inner loops. */ + if (outer->inner != loop || loop->next) + return false; + + /* Prevent head-controlled inner loops, that we usually have. + The guard block would need to be accepted + (invariant condition either entering or skipping the loop), + without also accepting arbitrary control flow. When unswitching + ran before us (as with -O3) this won't be a problem because its + outer loop unswitching will have moved out the invariant condition. + + If we do that we need to extend fuse_loops () to cope with this + by threading through the (still invariant) copied condition + between the two loop copies. */ + if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) + return false; + + /* The number of iterations of the inner loop must be loop invariant + with respect to the outer loop. 
*/ + if (!number_of_iterations_exit (loop, single_exit (loop), &niter, + false, true) + || niter.cmp == ERROR_MARK + || !integer_zerop (niter.may_be_zero) + || !expr_invariant_in_loop_p (outer, niter.niter)) + return false; + + /* If the inner loop produces any values that are used inside the + outer loop (except the virtual op) then it can flow + back (perhaps indirectly) into the inner loop. This prevents + fusion: without fusion the value at the last iteration is used, + with fusion the value after the initial iteration is used. + + If all uses are outside the outer loop this doesn't prevent fusion; + the value of the last iteration is still used (and the values from + all intermediate iterations are dead). */ + gphi_iterator psi; + for (psi = gsi_start_phis (single_exit (loop)->dest); + !gsi_end_p (psi); gsi_next (&psi)) + { + imm_use_iterator imm_iter; + use_operand_p use_p; + tree op = gimple_phi_result (psi.phi ()); + if (virtual_operand_p (op)) + continue; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op) + { + gimple *use_stmt = USE_STMT (use_p); + if (!is_gimple_debug (use_stmt) + && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt))) + return false; + } + } + + /* And check blocks belonging to just outer loop. */ + bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); + n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); + + for (i = 0; i < n; i++) + if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) + break; + free (bbs); + if (i != n) + return false; + + /* For now we can safely fuse copies of LOOP only if all + loop carried variables are inductions (or the virtual op). + + We could handle reductions as well (the initial value in the second + body would be the after-iter value of the first body) if it's over + an associative and commutative operation. We wouldn't + be able to handle unknown cycles. 
*/ + for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) + { + affine_iv iv; + tree op = gimple_phi_result (psi.phi ()); + + if (virtual_operand_p (op)) + continue; + if (!simple_iv (loop, loop, op, &iv, true)) + return false; + /* The inductions must be regular, loop invariant step and initial + value. */ + if (!expr_invariant_in_loop_p (outer, iv.step) + || !expr_invariant_in_loop_p (outer, iv.base)) + return false; + /* XXX With more effort we could also be able to deal with inductions + where the initial value is loop variant but a simple IV in the + outer loop. The initial value for the second body would be + the original initial value plus iv.base.step. The next value + for the fused loop would be the original next value of the first + copy, _not_ the next value of the second body. */ + } + + return true; +} + +/* Fuse LOOP with all further neighbors. The loops are expected to + be in appropriate form. */ + +static void +fuse_loops (struct loop *loop) +{ + struct loop *next = loop->next; + + while (next) + { + edge e; + + remove_branch (single_pred_edge (loop->latch)); + /* Make delete_basic_block not fiddle with the loop structure. */ + basic_block oldlatch = loop->latch; + loop->latch = NULL; + delete_basic_block (oldlatch); + e = redirect_edge_and_branch (loop_latch_edge (next), + loop->header); + loop->latch = e->src; + flush_pending_stmts (e); + + gcc_assert (EDGE_COUNT (next->header->preds) == 1); + + /* The PHI nodes of the second body (single-argument now) + need adjustments to use the right values: either directly + the value of the corresponding PHI in the first copy or + the one leaving the first body which unrolling did for us. + + See also unroll_jam_possible_p () for further possibilities. 
*/ + gphi_iterator psi_first, psi_second; + e = single_pred_edge (next->header); + for (psi_first = gsi_start_phis (loop->header), + psi_second = gsi_start_phis (next->header); + !gsi_end_p (psi_first); + gsi_next (&psi_first), gsi_next (&psi_second)) + { + gphi *phi_first = psi_first.phi (); + gphi *phi_second = psi_second.phi (); + tree firstop = gimple_phi_result (phi_first); + /* The virtual operand is correct already as it's + always live at exit, hence has a LCSSA node and outer + loop unrolling updated SSA form. */ + if (virtual_operand_p (firstop)) + continue; + + /* Due to unroll_jam_possible_p () we know that this is + an induction. The second body goes over the same + iteration space. */ + add_phi_arg (phi_second, firstop, e, + gimple_location (phi_first)); + } + gcc_assert (gsi_end_p (psi_second)); + + merge_loop_tree (loop, next); + gcc_assert (!next->num_nodes); + struct loop *ln = next->next; + delete_loop (next); + next = ln; + } + rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); +} + +/* Returns true if the distance in DDR can be determined and adjusts + the unroll factor in *UNROLL to make unrolling valid for that distance. + Otherwise return false. + + If this data dep can lead to a removed memory reference, increment + *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor + for this to happen. */ + +static bool +adjust_unroll_factor (struct data_dependence_relation *ddr, + unsigned *unroll, unsigned *profit_unroll, + unsigned *removed) +{ + bool ret = false; + if (DDR_ARE_DEPENDENT (ddr) != chrec_known) + { + if (DDR_NUM_DIST_VECTS (ddr) == 0) + return false; + unsigned i; + lambda_vector dist_v; + FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) + { + /* A distance (a,b) is at worst transformed into (a/N,b) by the + unrolling (factor N), so the transformation is valid if + a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll + factor needs to be limited so that the first condition holds. 
+ That may limit the factor down to zero in the worst case. */ + int dist = dist_v[0]; + if (dist < 0) + gcc_unreachable (); + else if ((unsigned)dist >= *unroll) + ; + else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) + || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) + && dist > 0)) + ; + else + *unroll = dist; + + /* With a distance (a,0) it's always profitable to unroll-and-jam + (by a+1), because one memory reference will go away. With + (a,b) and b != 0 that's less clear. We will increase the + number of streams without lowering the number of mem refs. + So for now only handle the first situation. */ + if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) + { + *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); + (*removed)++; + } + + ret = true; + } + } + return ret; +} + +/* Main entry point for the unroll-and-jam transformation + described above. */ + +static unsigned int +tree_loop_unroll_and_jam (void) +{ + struct loop *loop; + bool changed = false; + + gcc_assert (scev_initialized_p ()); + + /* Go through all innermost loops. 
*/
+  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+    {
+      struct loop *outer = loop_outer (loop);
+
+      if (loop_depth (loop) < 2
+          || optimize_loop_nest_for_size_p (outer))
+        continue;
+
+      if (!unroll_jam_possible_p (outer, loop))
+        continue;
+
+      vec<data_reference_p> datarefs;
+      vec<ddr_p> dependences;
+      unsigned unroll_factor, profit_unroll, removed;
+      struct tree_niter_desc desc;
+      bool unroll = false;
+
+      auto_vec<loop_p, 3> loop_nest;
+      dependences.create (10);
+      datarefs.create (10);
+      /* Skip (never abort on) nests that fail analysis or have no refs.  */
+      bool ok = compute_data_dependences_for_loop (outer, true, &loop_nest,
+                                                   &datarefs, &dependences);
+      if (!ok && dump_file && (dump_flags & TDF_DETAILS))
+        fprintf (dump_file, "Cannot analyze data dependencies\n");
+      if (!ok || !datarefs.length ())
+        {
+          free_data_refs (datarefs);
+          free_dependence_relations (dependences);
+          continue;
+        }
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        dump_data_dependence_relations (dump_file, dependences);
+
+      unroll_factor = (unsigned)-1;
+      profit_unroll = 1;
+      removed = 0;
+
+      /* Check all dependencies.  */
+      unsigned i;
+      struct data_dependence_relation *ddr;
+      FOR_EACH_VEC_ELT (dependences, i, ddr)
+        {
+          struct data_reference *dra, *drb;
+
+          /* If the refs are independent there's nothing to do.  */
+          if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+            continue;
+          dra = DDR_A (ddr);
+          drb = DDR_B (ddr);
+          /* Nothing interesting for the self dependencies.  */
+          if (dra == drb)
+            continue;
+
+          /* Now check the distance vector, for determining a sensible
+             outer unroll factor, and for validity of merging the inner
+             loop copies.  */
+          if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
+                                     &removed))
+            {
+              /* Couldn't get the distance vector.  For two reads that's
+                 harmless (we assume we should unroll).  For at least
+                 one write this means we can't check the dependence direction
+                 and hence can't determine safety.  
*/ + + if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) + { + unroll_factor = 0; + break; + } + } + } + + /* We regard a user-specified minimum percentage of zero as a request + to ignore all profitability concerns and apply the transformation + always. */ + if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) + profit_unroll = 2; + else if (removed * 100 / datarefs.length () + < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) + profit_unroll = 1; + if (unroll_factor > profit_unroll) + unroll_factor = profit_unroll; + if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) + unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); + unroll = (unroll_factor > 1 + && can_unroll_loop_p (outer, unroll_factor, &desc)); + + if (unroll) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, + find_loop_location (outer), + "applying unroll and jam with factor %d\n", + unroll_factor); + initialize_original_copy_tables (); + tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), + &desc); + free_original_copy_tables (); + fuse_loops (outer->inner); + changed = true; + } + + loop_nest.release (); + free_dependence_relations (dependences); + free_data_refs (datarefs); + } + + if (changed) + { + scev_reset (); + free_dominance_info (CDI_DOMINATORS); + return TODO_cleanup_cfg; + } + return 0; +} + +/* Pass boilerplate. */ + +namespace { + +const pass_data pass_data_loop_jam = +{ + GIMPLE_PASS, /* type. */ + "unrolljam", /* name. */ + OPTGROUP_LOOP, /* optinfo_flags. */ + TV_LOOP_JAM, /* tv_id. */ + PROP_cfg, /* properties_required. */ + 0, /* properties_provided. */ + 0, /* properties_destroyed. */ + 0, /* todo_flags_start. */ + 0, /* todo_flags_finish. 
*/ +}; + +class pass_loop_jam : public gimple_opt_pass +{ +public: + pass_loop_jam (gcc::context *ctxt) + : gimple_opt_pass (pass_data_loop_jam, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return flag_unroll_jam != 0; + } + virtual unsigned int execute (function *); + +}; + +unsigned int +pass_loop_jam::execute (function *fun) +{ + if (number_of_loops (fun) <= 1) + return 0; + + return tree_loop_unroll_and_jam (); +} + +} + +gimple_opt_pass * +make_pass_loop_jam (gcc::context *ctxt) +{ + return new pass_loop_jam (ctxt); +} + diff -N -urp a/gcc/opts.c b/gcc/opts.c --- a/gcc/opts.c 2018-11-07 11:37:24.891223860 +0800 +++ b/gcc/opts.c 2018-11-07 11:38:26.171223860 +0800 @@ -534,6 +534,7 @@ static const struct default_options defa { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 }, + { OPT_LEVELS_3_PLUS, OPT_floop_unroll_and_jam, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 }, diff -N -urp a/gcc/params.def b/gcc/params.def --- a/gcc/params.def 2018-11-07 11:37:27.543223860 +0800 +++ b/gcc/params.def 2018-11-07 11:38:26.171223860 +0800 @@ -1280,6 +1280,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK, "Enable loop epilogue vectorization using smaller vector size.", 0, 0, 1) +DEFPARAM (PARAM_UNROLL_JAM_MIN_PERCENT, + "unroll-jam-min-percent", + "Minimum percentage of memrefs that must go away for unroll-and-jam to be considered profitable.", + 1, 0, 100) + +DEFPARAM (PARAM_UNROLL_JAM_MAX_UNROLL, + "unroll-jam-max-unroll", + "Maximum unroll factor for the unroll-and-jam transformation.", + 4, 0, 0) + /* Local variables: diff -N -urp a/gcc/passes.def b/gcc/passes.def --- a/gcc/passes.def 2018-11-07 11:37:24.859223860 +0800 +++ b/gcc/passes.def 2018-11-07 11:38:26.171223860 +0800 @@ 
-272,6 +272,7 @@ along with GCC; see the file COPYING3. NEXT_PASS (pass_tree_unswitch); NEXT_PASS (pass_scev_cprop); NEXT_PASS (pass_loop_split); + NEXT_PASS (pass_loop_jam); /* All unswitching, final value replacement and splitting can expose empty loops. Remove them now. */ NEXT_PASS (pass_cd_dce); diff -N -urp a/gcc/timevar.def b/gcc/timevar.def --- a/gcc/timevar.def 2018-11-07 11:37:24.935223860 +0800 +++ b/gcc/timevar.def 2018-11-07 11:38:26.175223860 +0800 @@ -186,6 +186,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , " DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop") DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching") DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting") +DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam") DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling") DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops") DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization") diff -N -urp a/gcc/tree-pass.h b/gcc/tree-pass.h --- a/gcc/tree-pass.h 2018-11-07 11:37:24.887223860 +0800 +++ b/gcc/tree-pass.h 2018-11-07 11:38:26.175223860 +0800 @@ -369,6 +369,7 @@ extern gimple_opt_pass *make_pass_tree_l extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_loop_jam (gcc::context *ctxt); extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt); extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt); extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt);