906 lines
31 KiB
Diff
diff -N -urp a/gcc/Makefile.in b/gcc/Makefile.in
|
|
--- a/gcc/Makefile.in 2018-11-07 11:37:24.615223860 +0800
|
|
+++ b/gcc/Makefile.in 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -1292,6 +1292,7 @@ OBJS = \
|
|
gimple-iterator.o \
|
|
gimple-fold.o \
|
|
gimple-laddress.o \
|
|
+ gimple-loop-jam.o \
|
|
gimple-low.o \
|
|
gimple-pretty-print.o \
|
|
gimple-ssa-backprop.o \
|
|
diff -N -urp a/gcc/cfgloop.c b/gcc/cfgloop.c
|
|
--- a/gcc/cfgloop.c 2018-11-07 11:37:24.947223860 +0800
|
|
+++ b/gcc/cfgloop.c 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -296,13 +296,25 @@ establish_preds (struct loop *loop, stru
|
|
|
|
/* Add LOOP to the loop hierarchy tree where FATHER is father of the
|
|
added loop. If LOOP has some children, take care of that their
|
|
- pred field will be initialized correctly. */
|
|
+ pred field will be initialized correctly. If AFTER is non-null
|
|
+ then it's expected it's a pointer into FATHERs inner sibling
|
|
+ list and LOOP is added behind AFTER, otherwise it's added in front
|
|
+ of FATHERs siblings. */
|
|
|
|
void
|
|
-flow_loop_tree_node_add (struct loop *father, struct loop *loop)
|
|
+flow_loop_tree_node_add (struct loop *father, struct loop *loop,
|
|
+ struct loop *after)
|
|
{
|
|
- loop->next = father->inner;
|
|
- father->inner = loop;
|
|
+ if (after)
|
|
+ {
|
|
+ loop->next = after->next;
|
|
+ after->next = loop;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ loop->next = father->inner;
|
|
+ father->inner = loop;
|
|
+ }
|
|
|
|
establish_preds (loop, father);
|
|
}
|
|
diff -N -urp a/gcc/cfgloop.h b/gcc/cfgloop.h
|
|
--- a/gcc/cfgloop.h 2018-11-07 11:37:24.331223860 +0800
|
|
+++ b/gcc/cfgloop.h 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -324,7 +324,8 @@ void record_loop_exits (void);
|
|
void rescan_loop_exit (edge, bool, bool);
|
|
|
|
/* Loop data structure manipulation/querying. */
|
|
-extern void flow_loop_tree_node_add (struct loop *, struct loop *);
|
|
+extern void flow_loop_tree_node_add (struct loop *, struct loop *,
|
|
+ struct loop * = NULL);
|
|
extern void flow_loop_tree_node_remove (struct loop *);
|
|
extern bool flow_loop_nested_p (const struct loop *, const struct loop *);
|
|
extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block);
|
|
diff -N -urp a/gcc/cfgloopmanip.c b/gcc/cfgloopmanip.c
|
|
--- a/gcc/cfgloopmanip.c 2018-11-07 11:37:24.847223860 +0800
|
|
+++ b/gcc/cfgloopmanip.c 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -1026,9 +1026,11 @@ copy_loop_info (struct loop *loop, struc
|
|
}
|
|
|
|
/* Copies copy of LOOP as subloop of TARGET loop, placing newly
|
|
- created loop into loops structure. */
|
|
+ created loop into loops structure. If AFTER is non-null
|
|
+ the new loop is added at AFTER->next, otherwise in front of TARGETs
|
|
+ sibling list. */
|
|
struct loop *
|
|
-duplicate_loop (struct loop *loop, struct loop *target)
|
|
+duplicate_loop (struct loop *loop, struct loop *target, struct loop *after)
|
|
{
|
|
struct loop *cloop;
|
|
cloop = alloc_loop ();
|
|
@@ -1040,36 +1042,46 @@ duplicate_loop (struct loop *loop, struc
|
|
set_loop_copy (loop, cloop);
|
|
|
|
/* Add it to target. */
|
|
- flow_loop_tree_node_add (target, cloop);
|
|
+ flow_loop_tree_node_add (target, cloop, after);
|
|
|
|
return cloop;
|
|
}
|
|
|
|
/* Copies structure of subloops of LOOP into TARGET loop, placing
|
|
- newly created loops into loop tree. */
|
|
+ newly created loops into loop tree at the end of TARGETs sibling
|
|
+ list in the original order. */
|
|
void
|
|
duplicate_subloops (struct loop *loop, struct loop *target)
|
|
{
|
|
- struct loop *aloop, *cloop;
|
|
+ struct loop *aloop, *cloop, *tail;
|
|
|
|
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
|
|
+ ;
|
|
for (aloop = loop->inner; aloop; aloop = aloop->next)
|
|
{
|
|
- cloop = duplicate_loop (aloop, target);
|
|
+ cloop = duplicate_loop (aloop, target, tail);
|
|
+ tail = cloop;
|
|
+ gcc_assert (!tail->next);
|
|
duplicate_subloops (aloop, cloop);
|
|
}
|
|
}
|
|
|
|
/* Copies structure of subloops of N loops, stored in array COPIED_LOOPS,
|
|
- into TARGET loop, placing newly created loops into loop tree. */
|
|
+ into TARGET loop, placing newly created loops into loop tree adding
|
|
+ them to TARGETs sibling list at the end in order. */
|
|
static void
|
|
copy_loops_to (struct loop **copied_loops, int n, struct loop *target)
|
|
{
|
|
- struct loop *aloop;
|
|
+ struct loop *aloop, *tail;
|
|
int i;
|
|
|
|
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
|
|
+ ;
|
|
for (i = 0; i < n; i++)
|
|
{
|
|
- aloop = duplicate_loop (copied_loops[i], target);
|
|
+ aloop = duplicate_loop (copied_loops[i], target, tail);
|
|
+ tail = aloop;
|
|
+ gcc_assert (!tail->next);
|
|
duplicate_subloops (copied_loops[i], aloop);
|
|
}
|
|
}
|
|
@@ -1133,14 +1145,15 @@ set_zero_probability (edge e)
|
|
}
|
|
|
|
/* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating
|
|
- loop structure and dominators. E's destination must be LOOP header for
|
|
- this to work, i.e. it must be entry or latch edge of this loop; these are
|
|
- unique, as the loops must have preheaders for this function to work
|
|
- correctly (in case E is latch, the function unrolls the loop, if E is entry
|
|
- edge, it peels the loop). Store edges created by copying ORIG edge from
|
|
- copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to
|
|
- original LOOP body, the other copies are numbered in order given by control
|
|
- flow through them) into TO_REMOVE array. Returns false if duplication is
|
|
+ loop structure and dominators (order of inner subloops is retained).
|
|
+ E's destination must be LOOP header for this to work, i.e. it must be entry
|
|
+ or latch edge of this loop; these are unique, as the loops must have
|
|
+ preheaders for this function to work correctly (in case E is latch, the
|
|
+ function unrolls the loop, if E is entry edge, it peels the loop). Store
|
|
+ edges created by copying ORIG edge from copies corresponding to set bits in
|
|
+ WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies
|
|
+ are numbered in order given by control flow through them) into TO_REMOVE
|
|
+ array. Returns false if duplication is
|
|
impossible. */
|
|
|
|
bool
|
|
diff -N -urp a/gcc/cfgloopmanip.h b/gcc/cfgloopmanip.h
|
|
--- a/gcc/cfgloopmanip.h 2018-11-07 11:37:24.939223860 +0800
|
|
+++ b/gcc/cfgloopmanip.h 2018-11-07 11:38:26.155223860 +0800
|
|
@@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge,
|
|
unsigned, unsigned);
|
|
extern void unloop (struct loop *, bool *, bitmap);
|
|
extern void copy_loop_info (struct loop *loop, struct loop *target);
|
|
-extern struct loop * duplicate_loop (struct loop *, struct loop *);
|
|
+extern struct loop * duplicate_loop (struct loop *, struct loop *,
|
|
+ struct loop * = NULL);
|
|
extern void duplicate_subloops (struct loop *, struct loop *);
|
|
extern bool can_duplicate_loop_p (const struct loop *loop);
|
|
extern bool duplicate_loop_to_header_edge (struct loop *, edge,
|
|
diff -N -urp a/gcc/common.opt b/gcc/common.opt
|
|
--- a/gcc/common.opt 2018-11-07 11:37:24.859223860 +0800
|
|
+++ b/gcc/common.opt 2018-11-07 11:38:26.159223860 +0800
|
|
@@ -1496,8 +1496,8 @@ Common Alias(floop-nest-optimize)
|
|
Enable loop nest transforms. Same as -floop-nest-optimize.
|
|
|
|
floop-unroll-and-jam
|
|
-Common Alias(floop-nest-optimize)
|
|
-Enable loop nest transforms. Same as -floop-nest-optimize.
|
|
+Common Report Var(flag_unroll_jam) Optimization
|
|
+Perform unroll-and-jam on loops.
|
|
|
|
fgnu-tm
|
|
Common Report Var(flag_tm)
|
|
diff -N -urp a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
|
|
--- a/gcc/doc/invoke.texi 2018-11-07 11:37:24.915223860 +0800
|
|
+++ b/gcc/doc/invoke.texi 2018-11-07 11:39:49.031223860 +0800
|
|
@@ -7120,7 +7120,8 @@ Optimize yet more. @option{-O3} turns o
|
|
by @option{-O2} and also turns on the @option{-finline-functions},
|
|
@option{-funswitch-loops}, @option{-fpredictive-commoning},
|
|
@option{-fgcse-after-reload}, @option{-ftree-loop-vectorize},
|
|
-@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths}
|
|
+@option{-ftree-loop-distribute-patterns}, @option{-fsplit-paths},
|
|
+@option{-floop-unroll-and-jam},
|
|
@option{-ftree-slp-vectorize}, @option{-fvect-cost-model},
|
|
@option{-ftree-partial-pre}, @option{-fpeel-loops}
|
|
and @option{-fipa-cp-clone} options.
|
|
@@ -8226,12 +8227,10 @@ at @option{-O} and higher.
|
|
@itemx -floop-interchange
|
|
@itemx -floop-strip-mine
|
|
@itemx -floop-block
|
|
-@itemx -floop-unroll-and-jam
|
|
@opindex ftree-loop-linear
|
|
@opindex floop-interchange
|
|
@opindex floop-strip-mine
|
|
@opindex floop-block
|
|
-@opindex floop-unroll-and-jam
|
|
Perform loop nest optimizations. Same as
|
|
@option{-floop-nest-optimize}. To use this code transformation, GCC has
|
|
to be configured with @option{--with-isl} to enable the Graphite loop
|
|
@@ -8323,6 +8322,12 @@ ENDDO
|
|
@end smallexample
|
|
and the initialization loop is transformed into a call to memset zero.
|
|
|
|
+@item -floop-unroll-and-jam
|
|
+@opindex floop-unroll-and-jam
|
|
+Apply unroll and jam transformations on feasible loops. In a loop
|
|
+nest this unrolls the outer loop by some factor and fuses the resulting
|
|
+multiple inner loops. This flag is enabled by default at @option{-O3}.
|
|
+
|
|
@item -ftree-loop-im
|
|
@opindex ftree-loop-im
|
|
Perform loop invariant motion on trees. This pass moves only invariants that
|
|
@@ -10353,13 +10358,13 @@ loop in the loop nest by a given number
|
|
length can be changed using the @option{loop-block-tile-size}
|
|
parameter. The default value is 51 iterations.
|
|
|
|
-@item loop-unroll-jam-size
|
|
-Specify the unroll factor for the @option{-floop-unroll-and-jam} option. The
|
|
-default value is 4.
|
|
-
|
|
-@item loop-unroll-jam-depth
|
|
-Specify the dimension to be unrolled (counting from the most inner loop)
|
|
-for the @option{-floop-unroll-and-jam}. The default value is 2.
|
|
+@item unroll-jam-min-percent
|
|
+The minimum percentage of memory references that must be optimized
|
|
+away for the unroll-and-jam transformation to be considered profitable.
|
|
+
|
|
+@item unroll-jam-max-unroll
|
|
+The maximum number of times the outer loop should be unrolled by
|
|
+the unroll-and-jam transformation.
|
|
|
|
@item ipa-cp-value-list-size
|
|
IPA-CP attempts to track all possible values and types passed to a function's
|
|
diff -N -urp a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
|
|
--- a/gcc/gimple-loop-jam.c 1970-01-01 08:00:00.000000000 +0800
|
|
+++ b/gcc/gimple-loop-jam.c 2018-11-07 11:38:26.167223860 +0800
|
|
@@ -0,0 +1,598 @@
|
|
+/* Loop unroll-and-jam.
|
|
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
|
|
+
|
|
+This file is part of GCC.
|
|
+
|
|
+GCC is free software; you can redistribute it and/or modify it
|
|
+under the terms of the GNU General Public License as published by the
|
|
+Free Software Foundation; either version 3, or (at your option) any
|
|
+later version.
|
|
+
|
|
+GCC is distributed in the hope that it will be useful, but WITHOUT
|
|
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
+for more details.
|
|
+
|
|
+You should have received a copy of the GNU General Public License
|
|
+along with GCC; see the file COPYING3. If not see
|
|
+<http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include "config.h"
|
|
+#include "system.h"
|
|
+#include "coretypes.h"
|
|
+#include "params.h"
|
|
+#include "tree-pass.h"
|
|
+#include "backend.h"
|
|
+#include "tree.h"
|
|
+#include "gimple.h"
|
|
+#include "ssa.h"
|
|
+#include "fold-const.h"
|
|
+#include "tree-cfg.h"
|
|
+#include "tree-ssa.h"
|
|
+#include "tree-ssa-loop-niter.h"
|
|
+#include "tree-ssa-loop.h"
|
|
+#include "tree-ssa-loop-manip.h"
|
|
+#include "cfgloop.h"
|
|
+#include "tree-scalar-evolution.h"
|
|
+#include "gimple-iterator.h"
|
|
+#include "cfghooks.h"
|
|
+#include "tree-data-ref.h"
|
|
+#include "tree-ssa-loop-ivopts.h"
|
|
+#include "tree-vectorizer.h"
|
|
+
|
|
+/* Unroll and Jam transformation
|
|
+
|
|
+ This is a combination of two transformations, where the second
|
|
+ is not always valid. It's applicable if a loop nest has redundancies
|
|
+ over the iterations of an outer loop while not having that with
|
|
+ an inner loop.
|
|
+
|
|
+ Given this nest:
|
|
+ for (i) {
|
|
+ for (j) {
|
|
+ B (i,j)
|
|
+ }
|
|
+ }
|
|
+
|
|
+ first unroll:
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ B (i,j)
|
|
+ }
|
|
+ for (j) {
|
|
+ B (i+1,j)
|
|
+ }
|
|
+ }
|
|
+
|
|
+ then fuse the two adjacent inner loops resulting from that:
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ B (i,j)
|
|
+ B (i+1,j)
|
|
+ }
|
|
+ }
|
|
+
|
|
+ As the order of evaluations of the body B changes this is valid
|
|
+ only in certain situations: all distance vectors need to be forward.
|
|
+ Additionally if there are multiple induction variables than just
|
|
+ a counting control IV (j above) we can also deal with some situations.
|
|
+
|
|
+ The validity is checked by unroll_jam_possible_p, and the data-dep
|
|
+ testing below.
|
|
+
|
|
+ A trivial example where the fusion is wrong would be when
|
|
+ B (i,j) == x[j-1] = x[j];
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ x[j-1] = x[j];
|
|
+ }
|
|
+ for (j) {
|
|
+ x[j-1] = x[j];
|
|
+ }
|
|
+ } effect: move content to front by two elements
|
|
+ -->
|
|
+ for (i by 2) {
|
|
+ for (j) {
|
|
+ x[j-1] = x[j];
|
|
+ x[j-1] = x[j];
|
|
+ }
|
|
+ } effect: move content to front by one element
|
|
+*/
|
|
+
|
|
+/* Modify the loop tree for the fact that all code once belonging
|
|
+ to the OLD loop or the outer loop of OLD now is inside LOOP. */
|
|
+
|
|
+static void
|
|
+merge_loop_tree (struct loop *loop, struct loop *old)
|
|
+{
|
|
+ basic_block *bbs;
|
|
+ int i, n;
|
|
+ struct loop *subloop;
|
|
+ edge e;
|
|
+ edge_iterator ei;
|
|
+
|
|
+ /* Find its nodes. */
|
|
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
|
|
+ n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
|
|
+
|
|
+ for (i = 0; i < n; i++)
|
|
+ {
|
|
+ /* If the block was direct child of OLD loop it's now part
|
|
+ of LOOP. If it was outside OLD, then it moved into LOOP
|
|
+ as well. This avoids changing the loop father for BBs
|
|
+ in inner loops of OLD. */
|
|
+ if (bbs[i]->loop_father == old
|
|
+ || loop_depth (bbs[i]->loop_father) < loop_depth (old))
|
|
+ {
|
|
+ remove_bb_from_loops (bbs[i]);
|
|
+ add_bb_to_loop (bbs[i], loop);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* If we find a direct subloop of OLD, move it to LOOP. */
|
|
+ subloop = bbs[i]->loop_father;
|
|
+ if (loop_outer (subloop) == old && subloop->header == bbs[i])
|
|
+ {
|
|
+ flow_loop_tree_node_remove (subloop);
|
|
+ flow_loop_tree_node_add (loop, subloop);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Update the information about loop exit edges. */
|
|
+ for (i = 0; i < n; i++)
|
|
+ {
|
|
+ FOR_EACH_EDGE (e, ei, bbs[i]->succs)
|
|
+ {
|
|
+ rescan_loop_exit (e, false, false);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ loop->num_nodes = n;
|
|
+
|
|
+ free (bbs);
|
|
+}
|
|
+
|
|
+/* BB is part of the outer loop of an unroll-and-jam situation.
|
|
+ Check if any statements therein would prevent the transformation. */
|
|
+
|
|
+static bool
|
|
+bb_prevents_fusion_p (basic_block bb)
|
|
+{
|
|
+ gimple_stmt_iterator gsi;
|
|
+ /* BB is duplicated by outer unrolling and then all N-1 first copies
|
|
+ move into the body of the fused inner loop. If BB exits the outer loop
|
|
+ the last copy still does so, and the first N-1 copies are cancelled
|
|
+ by loop unrolling, so also after fusion it's the exit block.
|
|
+ But there might be other reasons that prevent fusion:
|
|
+ * stores or unknown side-effects prevent fusion
|
|
+ * loads don't
|
|
+ * computations into SSA names: these aren't problematic. Their
|
|
+ result will be unused on the exit edges of the first N-1 copies
|
|
+ (those aren't taken after unrolling). If they are used on the
|
|
+ other edge (the one leading to the outer latch block) they are
|
|
+ loop-carried (on the outer loop) and the Nth copy of BB will
|
|
+ compute them again (i.e. the first N-1 copies will be dead). */
|
|
+ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
|
|
+ {
|
|
+ gimple *g = gsi_stmt (gsi);
|
|
+ if (gimple_vdef (g) || gimple_has_side_effects (g))
|
|
+ return true;
|
|
+ }
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/* Given an inner loop LOOP (of some OUTER loop) determine if
|
|
+ we can safely fuse copies of it (generated by outer unrolling).
|
|
+ If so return true, otherwise return false. */
|
|
+
|
|
+static bool
|
|
+unroll_jam_possible_p (struct loop *outer, struct loop *loop)
|
|
+{
|
|
+ basic_block *bbs;
|
|
+ int i, n;
|
|
+ struct tree_niter_desc niter;
|
|
+
|
|
+ /* When fusing the loops we skip the latch block
|
|
+ of the first one, so it mustn't have any effects to
|
|
+ preserve. */
|
|
+ if (!empty_block_p (loop->latch))
|
|
+ return false;
|
|
+
|
|
+ if (!single_exit (loop))
|
|
+ return false;
|
|
+
|
|
+ /* We need a perfect nest. Quick check for adjacent inner loops. */
|
|
+ if (outer->inner != loop || loop->next)
|
|
+ return false;
|
|
+
|
|
+ /* Prevent head-controlled inner loops, that we usually have.
|
|
+ The guard block would need to be accepted
|
|
+ (invariant condition either entering or skipping the loop),
|
|
+ without also accepting arbitrary control flow. When unswitching
|
|
+ ran before us (as with -O3) this won't be a problem because its
|
|
+ outer loop unswitching will have moved out the invariant condition.
|
|
+
|
|
+ If we do that we need to extend fuse_loops () to cope with this
|
|
+ by threading through the (still invariant) copied condition
|
|
+ between the two loop copies. */
|
|
+ if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
|
|
+ return false;
|
|
+
|
|
+ /* The number of iterations of the inner loop must be loop invariant
|
|
+ with respect to the outer loop. */
|
|
+ if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
|
|
+ false, true)
|
|
+ || niter.cmp == ERROR_MARK
|
|
+ || !integer_zerop (niter.may_be_zero)
|
|
+ || !expr_invariant_in_loop_p (outer, niter.niter))
|
|
+ return false;
|
|
+
|
|
+ /* If the inner loop produces any values that are used inside the
|
|
+ outer loop (except the virtual op) then it can flow
|
|
+ back (perhaps indirectly) into the inner loop. This prevents
|
|
+ fusion: without fusion the value at the last iteration is used,
|
|
+ with fusion the value after the initial iteration is used.
|
|
+
|
|
+ If all uses are outside the outer loop this doesn't prevent fusion;
|
|
+ the value of the last iteration is still used (and the values from
|
|
+ all intermediate iterations are dead). */
|
|
+ gphi_iterator psi;
|
|
+ for (psi = gsi_start_phis (single_exit (loop)->dest);
|
|
+ !gsi_end_p (psi); gsi_next (&psi))
|
|
+ {
|
|
+ imm_use_iterator imm_iter;
|
|
+ use_operand_p use_p;
|
|
+ tree op = gimple_phi_result (psi.phi ());
|
|
+ if (virtual_operand_p (op))
|
|
+ continue;
|
|
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
|
|
+ {
|
|
+ gimple *use_stmt = USE_STMT (use_p);
|
|
+ if (!is_gimple_debug (use_stmt)
|
|
+ && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* And check blocks belonging to just outer loop. */
|
|
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
|
|
+ n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
|
|
+
|
|
+ for (i = 0; i < n; i++)
|
|
+ if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
|
|
+ break;
|
|
+ free (bbs);
|
|
+ if (i != n)
|
|
+ return false;
|
|
+
|
|
+ /* For now we can safely fuse copies of LOOP only if all
|
|
+ loop carried variables are inductions (or the virtual op).
|
|
+
|
|
+ We could handle reductions as well (the initial value in the second
|
|
+ body would be the after-iter value of the first body) if it's over
|
|
+ an associative and commutative operation. We wouldn't
|
|
+ be able to handle unknown cycles. */
|
|
+ for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
|
|
+ {
|
|
+ affine_iv iv;
|
|
+ tree op = gimple_phi_result (psi.phi ());
|
|
+
|
|
+ if (virtual_operand_p (op))
|
|
+ continue;
|
|
+ if (!simple_iv (loop, loop, op, &iv, true))
|
|
+ return false;
|
|
+ /* The inductions must be regular, loop invariant step and initial
|
|
+ value. */
|
|
+ if (!expr_invariant_in_loop_p (outer, iv.step)
|
|
+ || !expr_invariant_in_loop_p (outer, iv.base))
|
|
+ return false;
|
|
+ /* XXX With more effort we could also be able to deal with inductions
|
|
+ where the initial value is loop variant but a simple IV in the
|
|
+ outer loop. The initial value for the second body would be
|
|
+ the original initial value plus iv.base.step. The next value
|
|
+ for the fused loop would be the original next value of the first
|
|
+ copy, _not_ the next value of the second body. */
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Fuse LOOP with all further neighbors. The loops are expected to
|
|
+ be in appropriate form. */
|
|
+
|
|
+static void
|
|
+fuse_loops (struct loop *loop)
|
|
+{
|
|
+ struct loop *next = loop->next;
|
|
+
|
|
+ while (next)
|
|
+ {
|
|
+ edge e;
|
|
+
|
|
+ remove_branch (single_pred_edge (loop->latch));
|
|
+ /* Make delete_basic_block not fiddle with the loop structure. */
|
|
+ basic_block oldlatch = loop->latch;
|
|
+ loop->latch = NULL;
|
|
+ delete_basic_block (oldlatch);
|
|
+ e = redirect_edge_and_branch (loop_latch_edge (next),
|
|
+ loop->header);
|
|
+ loop->latch = e->src;
|
|
+ flush_pending_stmts (e);
|
|
+
|
|
+ gcc_assert (EDGE_COUNT (next->header->preds) == 1);
|
|
+
|
|
+ /* The PHI nodes of the second body (single-argument now)
|
|
+ need adjustments to use the right values: either directly
|
|
+ the value of the corresponding PHI in the first copy or
|
|
+ the one leaving the first body which unrolling did for us.
|
|
+
|
|
+ See also unroll_jam_possible_p () for further possibilities. */
|
|
+ gphi_iterator psi_first, psi_second;
|
|
+ e = single_pred_edge (next->header);
|
|
+ for (psi_first = gsi_start_phis (loop->header),
|
|
+ psi_second = gsi_start_phis (next->header);
|
|
+ !gsi_end_p (psi_first);
|
|
+ gsi_next (&psi_first), gsi_next (&psi_second))
|
|
+ {
|
|
+ gphi *phi_first = psi_first.phi ();
|
|
+ gphi *phi_second = psi_second.phi ();
|
|
+ tree firstop = gimple_phi_result (phi_first);
|
|
+ /* The virtual operand is correct already as it's
|
|
+ always live at exit, hence has a LCSSA node and outer
|
|
+ loop unrolling updated SSA form. */
|
|
+ if (virtual_operand_p (firstop))
|
|
+ continue;
|
|
+
|
|
+ /* Due to unroll_jam_possible_p () we know that this is
|
|
+ an induction. The second body goes over the same
|
|
+ iteration space. */
|
|
+ add_phi_arg (phi_second, firstop, e,
|
|
+ gimple_location (phi_first));
|
|
+ }
|
|
+ gcc_assert (gsi_end_p (psi_second));
|
|
+
|
|
+ merge_loop_tree (loop, next);
|
|
+ gcc_assert (!next->num_nodes);
|
|
+ struct loop *ln = next->next;
|
|
+ delete_loop (next);
|
|
+ next = ln;
|
|
+ }
|
|
+ rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
|
|
+}
|
|
+
|
|
+/* Returns true if the distance in DDR can be determined and adjusts
|
|
+ the unroll factor in *UNROLL to make unrolling valid for that distance.
|
|
+ Otherwise return false.
|
|
+
|
|
+ If this data dep can lead to a removed memory reference, increment
|
|
+ *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
|
|
+ for this to happen. */
|
|
+
|
|
+static bool
|
|
+adjust_unroll_factor (struct data_dependence_relation *ddr,
|
|
+ unsigned *unroll, unsigned *profit_unroll,
|
|
+ unsigned *removed)
|
|
+{
|
|
+ bool ret = false;
|
|
+ if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
|
|
+ {
|
|
+ if (DDR_NUM_DIST_VECTS (ddr) == 0)
|
|
+ return false;
|
|
+ unsigned i;
|
|
+ lambda_vector dist_v;
|
|
+ FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
|
|
+ {
|
|
+ /* A distance (a,b) is at worst transformed into (a/N,b) by the
|
|
+ unrolling (factor N), so the transformation is valid if
|
|
+ a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
|
|
+ factor needs to be limited so that the first condition holds.
|
|
+ That may limit the factor down to zero in the worst case. */
|
|
+ int dist = dist_v[0];
|
|
+ if (dist < 0)
|
|
+ gcc_unreachable ();
|
|
+ else if ((unsigned)dist >= *unroll)
|
|
+ ;
|
|
+ else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
|
|
+ || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
|
|
+ && dist > 0))
|
|
+ ;
|
|
+ else
|
|
+ *unroll = dist;
|
|
+
|
|
+ /* With a distance (a,0) it's always profitable to unroll-and-jam
|
|
+ (by a+1), because one memory reference will go away. With
|
|
+ (a,b) and b != 0 that's less clear. We will increase the
|
|
+ number of streams without lowering the number of mem refs.
|
|
+ So for now only handle the first situation. */
|
|
+ if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
|
|
+ {
|
|
+ *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
|
|
+ (*removed)++;
|
|
+ }
|
|
+
|
|
+ ret = true;
|
|
+ }
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Main entry point for the unroll-and-jam transformation
|
|
+ described above. */
|
|
+
|
|
+static unsigned int
|
|
+tree_loop_unroll_and_jam (void)
|
|
+{
|
|
+ struct loop *loop;
|
|
+ bool changed = false;
|
|
+
|
|
+ gcc_assert (scev_initialized_p ());
|
|
+
|
|
+ /* Go through all innermost loops. */
|
|
+ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
|
|
+ {
|
|
+ struct loop *outer = loop_outer (loop);
|
|
+
|
|
+ if (loop_depth (loop) < 2
|
|
+ || optimize_loop_nest_for_size_p (outer))
|
|
+ continue;
|
|
+
|
|
+ if (!unroll_jam_possible_p (outer, loop))
|
|
+ continue;
|
|
+
|
|
+ vec<data_reference_p> datarefs;
|
|
+ vec<ddr_p> dependences;
|
|
+ unsigned unroll_factor, profit_unroll, removed;
|
|
+ struct tree_niter_desc desc;
|
|
+ bool unroll = false;
|
|
+
|
|
+ auto_vec<loop_p, 3> loop_nest;
|
|
+ dependences.create (10);
|
|
+ datarefs.create (10);
|
|
+ if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
|
|
+ &datarefs, &dependences))
|
|
+ {
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ fprintf (dump_file, "Cannot analyze data dependencies\n");
|
|
+ free_data_refs (datarefs);
|
|
+ free_dependence_relations (dependences);
|
|
+ return false;
|
|
+ }
|
|
+ if (!datarefs.length ())
|
|
+ continue;
|
|
+
|
|
+ if (dump_file && (dump_flags & TDF_DETAILS))
|
|
+ dump_data_dependence_relations (dump_file, dependences);
|
|
+
|
|
+ unroll_factor = (unsigned)-1;
|
|
+ profit_unroll = 1;
|
|
+ removed = 0;
|
|
+
|
|
+ /* Check all dependencies. */
|
|
+ unsigned i;
|
|
+ struct data_dependence_relation *ddr;
|
|
+ FOR_EACH_VEC_ELT (dependences, i, ddr)
|
|
+ {
|
|
+ struct data_reference *dra, *drb;
|
|
+
|
|
+	  /* If the refs are independent there's nothing to do. */
|
|
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
|
|
+ continue;
|
|
+ dra = DDR_A (ddr);
|
|
+ drb = DDR_B (ddr);
|
|
+ /* Nothing interesting for the self dependencies. */
|
|
+ if (dra == drb)
|
|
+ continue;
|
|
+
|
|
+ /* Now check the distance vector, for determining a sensible
|
|
+ outer unroll factor, and for validity of merging the inner
|
|
+ loop copies. */
|
|
+ if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
|
|
+ &removed))
|
|
+ {
|
|
+ /* Couldn't get the distance vector. For two reads that's
|
|
+ harmless (we assume we should unroll). For at least
|
|
+ one write this means we can't check the dependence direction
|
|
+ and hence can't determine safety. */
|
|
+
|
|
+ if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
|
|
+ {
|
|
+ unroll_factor = 0;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* We regard a user-specified minimum percentage of zero as a request
|
|
+ to ignore all profitability concerns and apply the transformation
|
|
+ always. */
|
|
+ if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
|
|
+ profit_unroll = 2;
|
|
+ else if (removed * 100 / datarefs.length ()
|
|
+ < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
|
|
+ profit_unroll = 1;
|
|
+ if (unroll_factor > profit_unroll)
|
|
+ unroll_factor = profit_unroll;
|
|
+ if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
|
|
+ unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
|
|
+ unroll = (unroll_factor > 1
|
|
+ && can_unroll_loop_p (outer, unroll_factor, &desc));
|
|
+
|
|
+ if (unroll)
|
|
+ {
|
|
+ if (dump_enabled_p ())
|
|
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
|
|
+ find_loop_location (outer),
|
|
+ "applying unroll and jam with factor %d\n",
|
|
+ unroll_factor);
|
|
+ initialize_original_copy_tables ();
|
|
+ tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
|
|
+ &desc);
|
|
+ free_original_copy_tables ();
|
|
+ fuse_loops (outer->inner);
|
|
+ changed = true;
|
|
+ }
|
|
+
|
|
+ loop_nest.release ();
|
|
+ free_dependence_relations (dependences);
|
|
+ free_data_refs (datarefs);
|
|
+ }
|
|
+
|
|
+ if (changed)
|
|
+ {
|
|
+ scev_reset ();
|
|
+ free_dominance_info (CDI_DOMINATORS);
|
|
+ return TODO_cleanup_cfg;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Pass boilerplate. */
|
|
+
|
|
+namespace {
|
|
+
|
|
+const pass_data pass_data_loop_jam =
|
|
+{
|
|
+ GIMPLE_PASS, /* type. */
|
|
+ "unrolljam", /* name. */
|
|
+ OPTGROUP_LOOP, /* optinfo_flags. */
|
|
+ TV_LOOP_JAM, /* tv_id. */
|
|
+ PROP_cfg, /* properties_required. */
|
|
+ 0, /* properties_provided. */
|
|
+ 0, /* properties_destroyed. */
|
|
+ 0, /* todo_flags_start. */
|
|
+ 0, /* todo_flags_finish. */
|
|
+};
|
|
+
|
|
+class pass_loop_jam : public gimple_opt_pass
|
|
+{
|
|
+public:
|
|
+ pass_loop_jam (gcc::context *ctxt)
|
|
+ : gimple_opt_pass (pass_data_loop_jam, ctxt)
|
|
+ {}
|
|
+
|
|
+ /* opt_pass methods: */
|
|
+ virtual bool gate (function *)
|
|
+ {
|
|
+ return flag_unroll_jam != 0;
|
|
+ }
|
|
+ virtual unsigned int execute (function *);
|
|
+
|
|
+};
|
|
+
|
|
+unsigned int
|
|
+pass_loop_jam::execute (function *fun)
|
|
+{
|
|
+ if (number_of_loops (fun) <= 1)
|
|
+ return 0;
|
|
+
|
|
+ return tree_loop_unroll_and_jam ();
|
|
+}
|
|
+
|
|
+}
|
|
+
|
|
+gimple_opt_pass *
|
|
+make_pass_loop_jam (gcc::context *ctxt)
|
|
+{
|
|
+ return new pass_loop_jam (ctxt);
|
|
+}
|
|
+
|
|
diff -N -urp a/gcc/opts.c b/gcc/opts.c
|
|
--- a/gcc/opts.c 2018-11-07 11:37:24.891223860 +0800
|
|
+++ b/gcc/opts.c 2018-11-07 11:38:26.171223860 +0800
|
|
@@ -534,6 +534,7 @@ static const struct default_options defa
|
|
{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 },
|
|
+ { OPT_LEVELS_3_PLUS, OPT_floop_unroll_and_jam, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 },
|
|
{ OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 },
|
|
diff -N -urp a/gcc/params.def b/gcc/params.def
|
|
--- a/gcc/params.def 2018-11-07 11:37:27.543223860 +0800
|
|
+++ b/gcc/params.def 2018-11-07 11:38:26.171223860 +0800
|
|
@@ -1280,6 +1280,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK,
|
|
"Enable loop epilogue vectorization using smaller vector size.",
|
|
0, 0, 1)
|
|
|
|
+DEFPARAM (PARAM_UNROLL_JAM_MIN_PERCENT,
|
|
+ "unroll-jam-min-percent",
|
|
+ "Minimum percentage of memrefs that must go away for unroll-and-jam to be considered profitable.",
|
|
+ 1, 0, 100)
|
|
+
|
|
+DEFPARAM (PARAM_UNROLL_JAM_MAX_UNROLL,
|
|
+ "unroll-jam-max-unroll",
|
|
+ "Maximum unroll factor for the unroll-and-jam transformation.",
|
|
+ 4, 0, 0)
|
|
+
|
|
/*
|
|
|
|
Local variables:
|
|
diff -N -urp a/gcc/passes.def b/gcc/passes.def
|
|
--- a/gcc/passes.def 2018-11-07 11:37:24.859223860 +0800
|
|
+++ b/gcc/passes.def 2018-11-07 11:38:26.171223860 +0800
|
|
@@ -272,6 +272,7 @@ along with GCC; see the file COPYING3.
|
|
NEXT_PASS (pass_tree_unswitch);
|
|
NEXT_PASS (pass_scev_cprop);
|
|
NEXT_PASS (pass_loop_split);
|
|
+ NEXT_PASS (pass_loop_jam);
|
|
/* All unswitching, final value replacement and splitting can expose
|
|
empty loops. Remove them now. */
|
|
NEXT_PASS (pass_cd_dce);
|
|
diff -N -urp a/gcc/timevar.def b/gcc/timevar.def
|
|
--- a/gcc/timevar.def 2018-11-07 11:37:24.935223860 +0800
|
|
+++ b/gcc/timevar.def 2018-11-07 11:38:26.175223860 +0800
|
|
@@ -186,6 +186,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "
|
|
DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop")
|
|
DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
|
|
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
|
|
+DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
|
|
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
|
|
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
|
|
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
|
|
diff -N -urp a/gcc/tree-pass.h b/gcc/tree-pass.h
|
|
--- a/gcc/tree-pass.h 2018-11-07 11:37:24.887223860 +0800
|
|
+++ b/gcc/tree-pass.h 2018-11-07 11:38:26.175223860 +0800
|
|
@@ -369,6 +369,7 @@ extern gimple_opt_pass *make_pass_tree_l
|
|
extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt);
|
|
+extern gimple_opt_pass *make_pass_loop_jam (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
|
|
extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt);
|