This backport contains 128 patches from the GCC mainstream tree. The commit ID of each patch is listed below, in chronological order. The combined diff of all the patches follows the list.

0001-Aarch64-SVE-Dot-product-support.patch 9feeafd7f95ea9f7211908c137c60074b3a52da2
0002-tree-vect-stmts.c-get_group_load_store_type-Avoid-pe.patch 419c5f99876d9ee517f6b646dd785cdcaf5cb6fe
0003-re-PR-tree-optimization-90358-526.blender_r-train-ru.patch 898758504fa87d9f5e72c2c8b32139b413276a10
0004-tree-vect-slp.c-vect_build_slp_tree_2-Bump-size-when.patch 9f708a844853eb2fe87e696d27de14cbd68896f8
0005-cfgloop.h-struct-loop-Add-simdlen-member.patch f63445e56c265757ebd50dc12fcd01773341b49f
0006-Current-vectoriser-doesn-t-support-masked-loads-for-.patch 997636716c5dde7d59d026726a6f58918069f122
0007-tree-vrp.h-value_range_base-nonzero_p-New.patch f2b00d2ba461d6dafdeccf6d93828b349b5e7f76
0008-AArch64-PR-tree-optimization-90332-Implement-vec_ini.patch 41dab855dce20d5d7042c9330dd8124d0ece19c0
0009-Fix-a-thinko-in-tree-ssa-loop.c.patch cc261f66c268107b120add99942d729b3a489452
0010-re-PR-tree-optimization-90883-Generated-code-is-wors.patch 3fe0ddc88334f9afd622458653a6d103948994bd
0011-re-PR-tree-optimization-90883-Generated-code-is-wors.patch 08c1638dab9becfafc65064891c1c59f5711c27f
0012-Remove-quite-obvious-dead-assignments.patch 45309d286c80ecad8b7a4efba0e9aba35d847af6
0013-Fix-various-issues-seen-with-clang-static-analyzer.patch ef874db611879d5004e1d834543e55d31f2bfe1c
0014-re-PR-tree-optimization-91033-ICE-in-vect_analyze_lo.patch a7b3509eb6aa51d696be5edba6f4e451ceff03a0
0015-re-PR-tree-optimization-91069-Miscompare-of-453.povr.patch 75da268e1a563a1a52389cd2ecee12d07c45a655
0016-tree-vrp.c-extract_range_from_multiplicative_op-Add-.patch e2cfa983c31fa7886f496a47feb8714297ca0063
0017-re-PR-tree-optimization-91257-Compile-time-and-memor.patch a55d6091230ae8d0d6f6c20dcc55158f6705090e
0018-re-PR-tree-optimization-91257-Compile-time-and-memor.patch ce52e0ffb4f1ea7bd4fb99aea5dda75d260e438f
0019-Enforce-canonicalization-in-value_range.patch c7cf3a9bb00b6d64ba0c0e0761f000758e9428a6
0020-tree-vectorizer.h-get_initial_def_for_reduction-Remo.patch 5fdd6038147e4ba30c8c01332dae8ab0d717bc14
0021-tree-parloops.c-report_ploop_op-Copy-from-report_vec.patch 31de92e39bbeffb9f1641d292e94b48f70809ae1
0022-tree-vect-loop.c-vect_is_simple_reduction-Remove-ope.patch 901083b9bdf69a7b1382f9682c6fd1d5759667dd
0023-Enforce-correct-COND_EXPR-order-for-EXTRACT_LAST_RED.patch c449d3ae28ff4e133114fb67dbf7dcc7a95ca5d5
0024-tree-vect-loop.c-vect_is_slp_reduction-Remove.patch b3c4d0dd309b7027f6e0f0b9a84829fcd53f7d64
0025-re-PR-tree-optimization-91822-FAIL-gcc.dg-pr88031.c-.patch 6e222b2a3aede20f3093802d1649e75848e3bd2b
0026-re-PR-target-91269-unaligned-floating-point-register.patch d63eadac7db10d4846bdffa93fd164cb035fb102
0027-tree-vect-loop.c-get_initial_def_for_reduction-Simpl.patch d469a71e5a0eb512b522248841c56496abca8cd6
0028-tree-vectorizer.h-_stmt_vec_info-const_cond_reduc_co.patch a7701dd16103048432ec8051e4773760c0e2cf90
0029-re-PR-tree-optimization-91896-ICE-in-vect_get_vec_de.patch fadb01364d36a50836201bc9a6a03e525d267967
0030-tree-vect-loop.c-vect_analyze_loop_operations-Also-c.patch 9593e8e5e391e77bb065d4689b7511bed6a640a3
0031-tree-vect-loop.c-vect_analyze_loop_operations-Analyz.patch 1b4dbccc1f828fa00e6acc8b88d24301c65552df
0032-Fix-reduc_index-1-handling-for-COND_REDUCTION-PR9190.patch 18908a56e18f15f84a91a4529923dd0878b2294f
0033-tree-vectorizer.h-_stmt_vec_info-reduc_fn-New.patch 29f26978866f32bddd656847441a3a953ffd7a21
0034-gimple.c-gimple_get_lhs-For-PHIs-return-the-result.patch 61362d9d18916bd5b694385982cf4a02b7537b0e
0035-tree-vect-loop.c-vectorizable_reduction-Move-variabl.patch c7ea76ea5629e9f0357de49847274cf80e35f2f8
0036-tree-if-conv.c-tree_if_conversion-Move-call-to-ifcvt.patch f30b3d2891cef9803badb3f85d739c0fcfafd585
0037-tree-vectorizer.h-stmt_vec_info_type-cycle_phi_info_.patch 291fa23ac04e317877c1e102937532f080180bb2
0038-re-PR-tree-optimization-91940-__builtin_bswap16-loop.patch 9ff9a0a5e6edd8729f559bf86ca06f781c4da246
0039-tree-vectorizer.h-vect_transform_reduction-Declare.patch 9f4d9a366b3299c276043ab987234c7bed7d29f2
0040-re-PR-target-91982-gcc.target-aarch64-sve-clastb_-.c.patch 48528394eafa9d1db9f956570f910c76d429a3e5
0041-re-PR-tree-optimization-91532-SVE-Redundant-predicat.patch b238b34ea47222ffca7addc5fe4e8c052ade88b3
0042-tree-vectorizer.h-_stmt_vec_info-v_reduc_type-Remove.patch 69f8c1aef5cdcc54d5cb2ca4f99f4f26c2f822a9
0043-tree-vectorizer.h-_stmt_vec_info-reduc_vectype_in-Ne.patch f78347996e02a8a767a525bfb764e769afe29d67
0044-tree-vect-loop.c-vect_is_simple_reduction-Simplify-a.patch 4a8841c0413d52261a8d024577381582d07a866a
0045-re-PR-tree-optimization-92069-ice-in-vect_analyze_sc.patch 7bd8bec53f0e43c7a7852c54650746e65324514b
0046-Deal-with-incoming-POLY_INT_CST-ranges-PR92033.patch 96eb7d7a642085f651e9940f0ee75568d7c4441d
0047-tree-vect-loop.c-vect_valid_reduction_input_p-Remove.patch aab8c2fd6542a52663243eec160b80bdd61516d5
0048-tree-vect-loop.c-needs_fold_left_reduction_p-Export.patch aa9dffac731d0359a0e7a925ff8f4a1bef182eac
0049-vect-Refactor-versioning-threshold.patch a421fe9e610b5dbfce1913cd724c8ba193addd47
0050-vect-Outline-code-into-new-function-determine_peel_f.patch 31b35fd503e1c6713839db24044812d237aba5f1
0051-vect-Be-consistent-in-versioning-threshold-use.patch f261d4808cc28a2dfd47fe06c97364c0869bb78f
0052-tree-vect-loop.c-check_reduction_path-Compute-reduct.patch 58baf7ab85cbb1068a651c96f7d56e2902ead6cc
0053-tree-vectorizer.h-_stmt_vec_info-cond_reduc_code-Rem.patch c11cccc0285f02f117a1e80924fb7673b6486ce9
0054-re-PR-target-86753-gcc.target-aarch64-sve-vcond_-45-.patch cc1facefe3b4e3b067d95291a7dba834b830ff18
0055-Avoid-recomputing-data-references-in-BB-SLP.patch fa0c8df71d4f0476834db0b7cd88524878b46cf7
0056-Move-code-out-of-vect_slp_analyze_bb_1.patch 1d778697b37aec23db5b6003dfe08d2d78bd9424
0057-Avoid-setting-current_vector_size-in-get_vec_alignme.patch da157e2ee9e12348df78246ee33b244b7cc334df
0058-Pass-a-vec_info-to-vect_supportable_shift.patch a5c3185a503fbdbc1bf05efe8ab9d12850a211c1
0059-Pass-a-vec_info-to-vect_supportable_direct_optab_p.patch dcab2a0d1d4b2c0b4bba6f5e3834ec0678a2a5c8
0060-Pass-a-vec_info-to-get_mask_type_for_scalar_type.patch 1bd5196c9b1a0cd7280adadd6d788f81a82ca023
0061-Pass-a-vec_info-to-get_vectype_for_scalar_type.patch 7ed54790da87bbb4a134020a9fb8bd1b72fd0acb
0062-Pass-a-vec_info-to-duplicate_and_interleave.patch cdbe6e9bb4ae2882f77f94993783085fa342a9f9
0063-Pass-a-vec_info-to-can_duplicate_and_interleave_p.patch 43fdde5738ea0554fa000987e9769add027f4876
0064-Pass-a-vec_info-to-simple_integer_narrowing.patch 6c261c667801eee46a6221d3681d17493c0bbd65
0065-Pass-a-vec_info-to-supportable_narrowing_operation.patch db8374a63fd0ea84f72ac76cc899be44df36df6a
0066-Pass-a-loop_vec_info-to-vect_maybe_permute_loop_mask.patch b0dab10e71b03441beefbbf951c0812056413cd3
0067-Pass-a-vec_info-to-vect_halve_mask_nunits.patch 830e90dab3dee5c8129c7760ff09ab112c2cd271
0068-Pass-a-vec_info-to-vect_double_mask_nunits.patch 8d1473958808fe4714ec24991ac83ee6cbf45397
0069-Replace-current_vector_size-with-vec_info-vector_siz.patch ba7f76dd6bbf038948bbe516764a8bb0c851f750
0070-tree-vectorizer.h-_slp_tree-ops-New-member.patch 30c0d1e3cf8b03992e08cfd00ccf1fcb638d3c03
0071-re-PR-tree-optimization-92162-ICE-in-vect_create_epi.patch 53b15ca96116544a7a3ca8bc5f4e1649b74f3d45
0072-Fix-use-after-free-in-vector_size-change.patch 87121696fb2ddbec5f33daa359234850f7fd306d
0073-re-PR-tree-optimization-92173-ICE-in-optab_for_tree_.patch 9107d6526b938eba8168025c0d90d06ad3634e69
0074-re-PR-tree-optimization-92173-ICE-in-optab_for_tree_.patch 6c7b0df8029d01e05577668333660d0bc58a3023
0075-AArch64-Don-t-apply-mode_for_int_vector-to-scalars.patch d7814449f229cecdee48afe381519a61ea7e3378
0076-re-PR-tree-optimization-65930-Reduction-with-sign-ch.patch 82e8e335f917b9ce40801838c06f7945cf88da43
0077-re-PR-tree-optimization-92205-ICE-in-vect_get_vec_de.patch e227594789d909fbad56f6036910938678738f92
0078-tree-vect-slp.c-vect_get_and_check_slp_defs-For-redu.patch 4352288a3df915575a2b820f702242908740106f
0079-tree-vect-loop.c-vectorizable_reduction-Verify-STMT_.patch ea133b14f48ed5730748a7e02e322fb07ccc2d85
0080-Fix-reductions-for-fully-masked-loops.patch 89d0345ad7b8d84045813972ee60557a6b511c57
0081-tree-vect-loop.c-vect_create_epilog_for_reduction-Us.patch e0c4f7fbd6a4ee8e3a1468514044bd941fa28522
0082-re-PR-tree-optimization-92241-ice-in-vect_mark_patte.patch 97c6bea819ec0a773041308e62a7c05c33f093b0
0083-re-PR-tree-optimization-65930-Reduction-with-sign-ch.patch b7ff7cef5005721e78d6936bed3ae1c059b4e8d2
0084-Fix-reduc_index-calculation-in-vectorizable_conditio.patch 1d149b7260bcc4c0c6367b3aea47a8b91a1cf345
0085-vect-PR-88915-Vectorize-epilogues-when-versioning-lo.patch 97c146036750e7cb3966d292572ec158a78f356e
0086-re-PR-tree-optimization-65930-Reduction-with-sign-ch.patch b4673569c2a8b974e3f84ffaa547941c5d40cfe5
0087-Come-up-with-an-abstraction.patch 7f4a8ee03d404c560dcb75ba684fd57ffbc77e85
0088-re-PR-tree-optimization-92275-ICE-error-definition-i.patch b81f2dafdbd2c5aa49213b35dc12d4610834e39e
0089-vect-Make-vect-epilogues-nomask-1-default.patch 1297712fb4af6c6bfd827e0f0a9695b14669f87d
0090-vect-Clean-up-orig_loop_vinfo-from-vect_analyze_loop.patch 494d6c28c53d0852bb6468b1f1ca189159775fcc
0091-re-PR-tree-optimization-92371-ICE-in-info_for_reduct.patch 02bf7e6fa219f939b3225c54fbe8bab2133b1aeb
0092-vect-PR92317-fix-skip_epilogue-creation-for-epilogue.patch 2e7a4f579b1157754ea20a03431b4fa80cd4567a
0093-Restructure-vect_analyze_loop.patch 72d6aeecd95ec49fff1d258e4631167a03351cbb
0094-Check-the-VF-is-small-enough-for-an-epilogue-loop.patch 8ec5b16a9a3dbd6d825596c22f1bc32646de28fe
0095-tree-vect-loop.c-vectorizable_reduction-Remember-red.patch 06af1f1a0def9de076ec629ea634122f15882ce6
0096-Don-t-vectorise-single-iteration-epilogues.patch 4b205bf82d06c4d9d0ae7b78e54c712d79d5b021
0097-re-PR-tree-optimization-92405-ICE-in-vect_get_vec_de.patch 084d390246c2172853f9e12ce04aef23cba79590
0098-re-PR-tree-optimization-92324-ICE-in-expand_direct_o.patch f1e1ed3314b7c6308f64cbbcf6d1916e239c8e35
0099-vect-Disable-vectorization-of-epilogues-for-loops-wi.patch b602712b3ea2a0729a2eda61bd9ee795aba6138f
0100-Use-correct-vector-type-in-neutral_op_for_slp_reduct.patch d308ca27c71e43625b378dc6c2774105867d4fa7
0101-vect-Account-for-epilogue-s-peeling-for-gaps-when-ch.patch 87b47251924c7539a9a8e191587d118a14496473
0102-Add-a-targetm.vectorize.related_mode-hook.patch f09552335030433018fd5f7f6b9848339b5ca2da
0103-Replace-mode_for_int_vector-with-related_int_vector_.patch d083ee47a9828236016841356fc7207e7c90bbbd
0104-Add-build_truth_vector_type_for_mode.patch 0a0ef2387cc1561d537d8d949aef9479ef17ba35
0105-Remove-build_-same_sized_-truth_vector_type.patch e8738f4e9686203451fd11f05b268b8a31b95ebd
0106-Pass-the-data-vector-mode-to-get_mask_mode.patch 10116ec1c147a76522cafba6b6a5b4ed1cb37b77
0107-Use-build_vector_type_for_mode-in-get_vectype_for_sc.patch 95da266b86fcdeff84fcadc5e3cde3d0027e571d
0108-Use-consistent-compatibility-checks-in-vectorizable_.patch 0203c4f3bfb3e3242635b0cee0b9deedb4070a62
0109-Use-consistent-compatibility-checks-in-vectorizable_.patch e021fb865564b62a10adb1e98f75b5ea05058047
0110-Replace-vec_info-vector_size-with-vec_info-vector_mo.patch 1c84a2d25ecd4c03dde745f36a4762dd45f97c85
0111-Make-less-use-of-get_same_sized_vectype.patch 2df4150075c03f8a292c40afd3bb25febb673578
0112-Require-equal-type-sizes-for-vectorised-calls.patch 7f52eb891b738337d5cf82c7c440a5eea8c7b0c9
0113-Support-vectorisation-with-mixed-vector-sizes.patch df7c22831f1e48dba49479c5960c1c180d8eab2c
0114-Avoid-retrying-with-the-same-vector-modes.patch a55d8232df3dd4f7a3f5b70025074c3919b802a6
0115-AArch64-Support-vectorising-with-multiple-vector-siz.patch 74166aabeb7f22990476b1169bba031b8323ee92
0116-Allow-mixed-vector-sizes-within-a-single-vectorised-.patch 05101d1b575a57ca26e4275e971da85a0dd1d52a
0117-Vectorise-conversions-between-differently-sized-inte.patch 9c437a108a14b9bdc44659c131b0da944e5ffeab
0118-Consider-building-nodes-from-scalars-in-vect_slp_ana.patch 60838d634634a70d65a126166c944b159ac7649c
0119-Optionally-pick-the-cheapest-loop_vec_info.patch bcc7e346bf9b5dc77797ea949d6adc740deb30ca
0120-Move-canonicalisation-of-dr_with_seg_len_pair_ts.patch 1fb2b0f69ee849142b669ba1b82264ce6d0f75f9
0121-Delay-swapping-data-refs-in-prune_runtime_alias_test.patch 97602450b04e94aff034381bf6ee4236b95727ed
0122-Add-flags-to-dr_with_seg_len_pair_t.patch e9acf80c96d681917d930869b7cbfb7d2fa54d51
0123-Record-whether-a-dr_with_seg_len-contains-mixed-step.patch 52c29905259363ce2b78dd7aa8a25cf531cddb3a
0124-Dump-the-list-of-merged-alias-pairs.patch cad984b289e2b3aca786314c673339eb0500fefa
0125-Print-the-type-of-alias-check-in-a-dump-message.patch b4d1b635737a4780e5be247f8be9550eaf83dae5
0126-Use-a-single-comparison-for-index-based-alias-checks.patch f9d6338bd15ce1fae36bf25d3a0545e9678ddc58
0127-Optimise-WAR-and-WAW-alias-checks.patch 8489e1f45b50600c01eb8ed8c5d0ca914ded281c
0128-Avoid-quadratic-behaviour-in-prune_runtime_alias_tes.patch ea1ff9e46c7ec5e49ec671616cfcf405ef665054

diff --git a/gcc/asan.c b/gcc/asan.c index 3b800b26b69..605d04f87f7 100644 --- a/gcc/asan.c +++ b/gcc/asan.c @@ -1713,8 +1713,8 @@ asan_emit_allocas_unpoison (rtx top, rtx bot, rtx_insn *before) rtx ret = init_one_libfunc ("__asan_allocas_unpoison"); top = convert_memory_address (ptr_mode, top); bot = convert_memory_address (ptr_mode, bot); - ret = emit_library_call_value (ret, NULL_RTX, LCT_NORMAL, ptr_mode, - top, ptr_mode, bot, ptr_mode); + emit_library_call (ret, LCT_NORMAL, ptr_mode, + top, ptr_mode, bot, ptr_mode); do_pending_stack_adjust (); rtx_insn *insns = get_insns (); diff --git a/gcc/bt-load.c b/gcc/bt-load.c index a7d9d53954e..f68879ca49a 100644 --- a/gcc/bt-load.c +++ b/gcc/bt-load.c @@ -1169,7 +1169,6 @@ move_btr_def (basic_block new_def_bb, int btr, btr_def *def, bitmap live_range, if (def->other_btr_uses_before_def) { - insp = BB_END (b); for (insp = BB_END (b); !
INSN_P (insp); insp = PREV_INSN (insp)) gcc_assert (insp != BB_HEAD (b)); diff --git a/gcc/builtins.c b/gcc/builtins.c index ed11f79ff0b..910e614a4d1 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -1653,11 +1653,8 @@ expand_builtin_apply_args_1 (void) /* Save the structure value address unless this is passed as an "invisible" first argument. */ if (struct_incoming_value) - { - emit_move_insn (adjust_address (registers, Pmode, size), - copy_to_reg (struct_incoming_value)); - size += GET_MODE_SIZE (Pmode); - } + emit_move_insn (adjust_address (registers, Pmode, size), + copy_to_reg (struct_incoming_value)); /* Return the address of the block. */ return copy_addr_to_reg (XEXP (registers, 0)); @@ -1806,7 +1803,6 @@ expand_builtin_apply (rtx function, rtx arguments, rtx argsize) emit_move_insn (struct_value, value); if (REG_P (struct_value)) use_reg (&call_fusage, struct_value); - size += GET_MODE_SIZE (Pmode); } /* All arguments and registers used for the call are set up by now! */ diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index c0582a54c93..cb999cbf82f 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -5424,7 +5424,7 @@ build_conditional_expr (location_t colon_loc, tree ifexp, bool ifexp_bcp, tree elem_type = TREE_TYPE (vectype); tree zero = build_int_cst (elem_type, 0); tree zero_vec = build_vector_from_val (vectype, zero); - tree cmp_type = build_same_sized_truth_vector_type (vectype); + tree cmp_type = truth_type_for (vectype); ifexp = build2 (NE_EXPR, cmp_type, ifexp, zero_vec); } @@ -11327,7 +11327,7 @@ build_vec_cmp (tree_code code, tree type, { tree zero_vec = build_zero_cst (type); tree minus_one_vec = build_minus_one_cst (type); - tree cmp_type = build_same_sized_truth_vector_type (type); + tree cmp_type = truth_type_for (type); tree cmp = build2 (code, cmp_type, arg0, arg1); return build3 (VEC_COND_EXPR, type, cmp, minus_one_vec, zero_vec); } diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c index e252975f546..4ae8e3b3297 100644 --- a/gcc/cfgexpand.c +++ b/gcc/cfgexpand.c @@ -3029,7 +3029,6 @@ expand_asm_stmt (gasm *stmt) } } } - unsigned nclobbers = clobber_rvec.length(); /* First pass over inputs and outputs checks validity and sets mark_addressable if needed. */ @@ -3301,7 +3300,7 @@ expand_asm_stmt (gasm *stmt) gcc_assert (constraints.length() == noutputs + ninputs); /* But it certainly can adjust the clobbers. */ - nclobbers = clobber_rvec.length(); + unsigned nclobbers = clobber_rvec.length (); /* Third pass checks for easy conflicts. */ /* ??? Why are we doing this on trees instead of rtx. */ @@ -5979,11 +5978,11 @@ construct_init_block (void) { first_block = e->dest; redirect_edge_succ (e, init_block); - e = make_single_succ_edge (init_block, first_block, flags); + make_single_succ_edge (init_block, first_block, flags); } else - e = make_single_succ_edge (init_block, EXIT_BLOCK_PTR_FOR_FN (cfun), - EDGE_FALLTHRU); + make_single_succ_edge (init_block, EXIT_BLOCK_PTR_FOR_FN (cfun), + EDGE_FALLTHRU); update_bb_for_insn (init_block); return init_block; diff --git a/gcc/cfghooks.c b/gcc/cfghooks.c index a1d603a207e..a18b6490bdd 100644 --- a/gcc/cfghooks.c +++ b/gcc/cfghooks.c @@ -253,8 +253,6 @@ verify_flow_info (void) err = 1; } - last_bb_seen = ENTRY_BLOCK_PTR_FOR_FN (cfun); - /* Clean up. 
*/ free (last_visited); free (edge_checksum); diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index b78d87d22f1..98bf6d2adda 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -174,6 +174,9 @@ struct GTY ((chain_next ("%h.next"))) loop { of the loop can be safely evaluated concurrently. */ int safelen; + /* Preferred vectorization factor for the loop if non-zero. */ + int simdlen; + /* Constraints are generally set by consumers and affect certain semantics of niter analyzer APIs. Currently the APIs affected are number_of_iterations_exit* functions and their callers. One typical diff --git a/gcc/cfgloopmanip.c b/gcc/cfgloopmanip.c index ea4b914c15b..8fc697ecf5d 100644 --- a/gcc/cfgloopmanip.c +++ b/gcc/cfgloopmanip.c @@ -364,7 +364,6 @@ remove_path (edge e, bool *irred_invalidated, for (i = 0; i < nrem; i++) { - bb = rem_bbs[i]; FOR_EACH_EDGE (ae, ei, rem_bbs[i]->succs) if (ae->dest != EXIT_BLOCK_PTR_FOR_FN (cfun) && !bitmap_bit_p (seen, ae->dest->index)) @@ -1016,6 +1015,7 @@ copy_loop_info (struct loop *loop, struct loop *target) target->nb_iterations_estimate = loop->nb_iterations_estimate; target->estimate_state = loop->estimate_state; target->safelen = loop->safelen; + target->simdlen = loop->simdlen; target->constraints = loop->constraints; target->can_be_parallel = loop->can_be_parallel; target->warned_aggressive_loop_optimizations diff --git a/gcc/cfgrtl.c b/gcc/cfgrtl.c index 08e534f2485..b5f15907bde 100644 --- a/gcc/cfgrtl.c +++ b/gcc/cfgrtl.c @@ -2958,7 +2958,6 @@ rtl_verify_bb_layout (void) basic_block last_bb_seen = ENTRY_BLOCK_PTR_FOR_FN (cfun), curr_bb = NULL; num_bb_notes = 0; - last_bb_seen = ENTRY_BLOCK_PTR_FOR_FN (cfun); for (x = rtx_first; x; x = NEXT_INSN (x)) { diff --git a/gcc/cgraph.c b/gcc/cgraph.c index a16f4668b3c..bed6838d22b 100644 --- a/gcc/cgraph.c +++ b/gcc/cgraph.c @@ -2717,8 +2717,6 @@ bool cgraph_node::set_pure_flag (bool pure, bool looping) { struct set_pure_flag_info info = {pure, looping, false}; - if (!pure) - looping = false; call_for_symbol_thunks_and_aliases (set_pure_flag_1, &info, !pure, true); return info.changed; } diff --git a/gcc/combine.c b/gcc/combine.c index 567aa2c3715..b9d674c96cc 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -6591,7 +6591,6 @@ simplify_if_then_else (rtx x) || reg_mentioned_p (true_rtx, false_rtx) || rtx_equal_p (false_rtx, XEXP (cond, 0)))) { - true_code = reversed_comparison_code (cond, NULL); SUBST (XEXP (x, 0), reversed_comparison (cond, GET_MODE (cond))); SUBST (XEXP (x, 1), false_rtx); SUBST (XEXP (x, 2), true_rtx); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index e3852c5d182..28f93a70801 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3183,7 +3183,7 @@ ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. 
-(define_insn "*aarch64_combinez" +(define_insn "@aarch64_combinez" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 1 "general_operand" "w,?r,m") @@ -3197,7 +3197,7 @@ (set_attr "arch" "simd,fp,simd")] ) -(define_insn "*aarch64_combinez_be" +(define_insn "@aarch64_combinez_be" [(set (match_operand: 0 "register_operand" "=w,w,w") (vec_concat: (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") @@ -5926,6 +5926,15 @@ DONE; }) +(define_expand "vec_init" + [(match_operand:VQ_NO2E 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_SIMD" +{ + aarch64_expand_vector_init (operands[0], operands[1]); + DONE; +}) + (define_insn "*aarch64_simd_ld1r" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_duplicate:VALL_F16 @@ -6937,3 +6946,21 @@ "pmull2\\t%0.1q, %1.2d, %2.2d" [(set_attr "type" "crypto_pmull")] ) + +;; Sign- or zero-extend a 64-bit integer vector to a 128-bit vector. +(define_insn "2" + [(set (match_operand:VQN 0 "register_operand" "=w") + (ANY_EXTEND:VQN (match_operand: 1 "register_operand" "w")))] + "TARGET_SIMD" + "xtl\t%0., %1." + [(set_attr "type" "neon_shift_imm_long")] +) + +;; Truncate a 128-bit integer vector to a 64-bit vector. +(define_insn "trunc2" + [(set (match_operand: 0 "register_operand" "=w") + (truncate: (match_operand:VQN 1 "register_operand" "w")))] + "TARGET_SIMD" + "xtn\t%0., %1." + [(set_attr "type" "neon_shift_imm_narrow_q")] +) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 3f39c4c5b63..02d33b7276f 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3132,3 +3132,19 @@ DONE; } ) + +;; Unpredicated DOT product. +(define_insn "dot_prod" + [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") + (plus:SVE_SDI + (unspec:SVE_SDI + [(match_operand: 1 "register_operand" "w, w") + (match_operand: 2 "register_operand" "w, w")] + DOTPROD) + (match_operand:SVE_SDI 3 "register_operand" "0, w")))] + "TARGET_SVE" + "@ + dot\\t%0., %1., %2. + movprfx\t%0, %3\;dot\\t%0., %1., %2." + [(set_attr "movprfx" "*,yes")] +) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2ff0bc0a686..128c250dffe 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1549,17 +1549,37 @@ aarch64_sve_pred_mode (unsigned int elem_nbytes) /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ static opt_machine_mode -aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes) +aarch64_get_mask_mode (machine_mode mode) { - if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR)) + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_SVE_DATA) + return aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)); + + return default_get_mask_mode (mode); +} + +/* Implement TARGET_VECTORIZE_RELATED_MODE. */ + +static opt_machine_mode +aarch64_vectorize_related_mode (machine_mode vector_mode, + scalar_mode element_mode, + poly_uint64 nunits) +{ + unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode); + + /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. 
*/ + if ((vec_flags & VEC_ADVSIMD) + && known_eq (nunits, 0U) + && known_eq (GET_MODE_BITSIZE (vector_mode), 64U) + && maybe_ge (GET_MODE_BITSIZE (element_mode) + * GET_MODE_NUNITS (vector_mode), 128U)) { - unsigned int elem_nbytes = vector_element_size (nbytes, nunits); - machine_mode pred_mode; - if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode)) - return pred_mode; + machine_mode res = aarch64_simd_container_mode (element_mode, 128); + if (VECTOR_MODE_P (res)) + return res; } - return default_get_mask_mode (nunits, nbytes); + return default_vectorize_related_mode (vector_mode, element_mode, nunits); } /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, @@ -10897,7 +10917,9 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) /* Caller assumes we cannot fail. */ gcc_assert (use_rsqrt_p (mode)); - machine_mode mmsk = mode_for_int_vector (mode).require (); + machine_mode mmsk = (VECTOR_MODE_P (mode) + ? related_int_vector_mode (mode).require () + : int_mode_for_mode (mode).require ()); rtx xmsk = gen_reg_rtx (mmsk); if (!recp) /* When calculating the approximate square root, compare the @@ -14226,13 +14248,34 @@ aarch64_preferred_simd_mode (scalar_mode mode) /* Return a list of possible vector sizes for the vectorizer to iterate over. */ -static void -aarch64_autovectorize_vector_sizes (vector_sizes *sizes) +static unsigned int +aarch64_autovectorize_vector_modes (vector_modes *modes, bool) { if (TARGET_SVE) - sizes->safe_push (BYTES_PER_SVE_VECTOR); - sizes->safe_push (16); - sizes->safe_push (8); + modes->safe_push (VNx16QImode); + + /* Try using 128-bit vectors for all element types. */ + modes->safe_push (V16QImode); + + /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors + for wider elements. */ + modes->safe_push (V8QImode); + + /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors + for wider elements. + + TODO: We could support a limited form of V4QImode too, so that + we use 32-bit vectors for 8-bit elements. */ + modes->safe_push (V4HImode); + + /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors + for 64-bit elements. + + TODO: We could similarly support limited forms of V2QImode and V2HImode + for this case. */ + modes->safe_push (V2SImode); + + return 0; } /* Implement TARGET_MANGLE_TYPE. */ @@ -15191,6 +15234,45 @@ aarch64_expand_vector_init (rtx target, rtx vals) rtx v0 = XVECEXP (vals, 0, 0); bool all_same = true; + /* This is a special vec_init where N is not an element mode but a + vector mode with half the elements of M. We expect to find two entries + of mode N in VALS and we must put their concatentation into TARGET. */ + if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0)))) + { + gcc_assert (known_eq (GET_MODE_SIZE (mode), + 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0))))); + rtx lo = XVECEXP (vals, 0, 0); + rtx hi = XVECEXP (vals, 0, 1); + machine_mode narrow_mode = GET_MODE (lo); + gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode); + gcc_assert (narrow_mode == GET_MODE (hi)); + + /* When we want to concatenate a half-width vector with zeroes we can + use the aarch64_combinez[_be] patterns. Just make sure that the + zeroes are in the right half. 
*/ + if (BYTES_BIG_ENDIAN + && aarch64_simd_imm_zero (lo, narrow_mode) + && general_operand (hi, narrow_mode)) + emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo)); + else if (!BYTES_BIG_ENDIAN + && aarch64_simd_imm_zero (hi, narrow_mode) + && general_operand (lo, narrow_mode)) + emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi)); + else + { + /* Else create the two half-width registers and combine them. */ + if (!REG_P (lo)) + lo = force_reg (GET_MODE (lo), lo); + if (!REG_P (hi)) + hi = force_reg (GET_MODE (hi), hi); + + if (BYTES_BIG_ENDIAN) + std::swap (lo, hi); + emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi)); + } + return; + } + /* Count the number of variable elements to initialise. */ for (int i = 0; i < n_elts; ++i) { @@ -16684,7 +16766,7 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) if (d->testing_p) return true; - machine_mode sel_mode = mode_for_int_vector (d->vmode).require (); + machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); if (d->one_vector_p) emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel)); @@ -17064,9 +17146,7 @@ void aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode, rtx *ops) { - machine_mode pred_mode - = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode), - GET_MODE_SIZE (cmp_mode)).require (); + machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require (); rtx pred = gen_reg_rtx (pred_mode); if (FLOAT_MODE_P (cmp_mode)) { @@ -19363,9 +19443,9 @@ aarch64_libgcc_floating_mode_supported_p #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ aarch64_builtin_vectorized_function -#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES -#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ - aarch64_autovectorize_vector_sizes +#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES +#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ + aarch64_autovectorize_vector_modes #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \ @@ -19398,6 +19478,8 @@ aarch64_libgcc_floating_mode_supported_p #define TARGET_VECTORIZE_VEC_PERM_CONST \ aarch64_vectorize_vec_perm_const +#undef TARGET_VECTORIZE_RELATED_MODE +#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode #undef TARGET_VECTORIZE_GET_MASK_MODE #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 6caeeac8086..c7ccd5bf6fe 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -663,6 +663,9 @@ (QI "b") (HI "h") (SI "s") (DI "d")]) +;; Like Vetype, but map to types that are a quarter of the element size. +(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")]) + ;; Equivalent of "size" for a vector element. (define_mode_attr Vesize [(VNx16QI "b") (VNx8HI "h") (VNx8HF "h") @@ -765,6 +768,7 @@ ;; Half modes of all vector modes, in lower-case. (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi") (V4HI "v2hi") (V8HI "v4hi") + (V8HF "v4hf") (V2SI "si") (V4SI "v2si") (V2DI "di") (V2SF "sf") (V4SF "v2sf") (V2DF "df")]) @@ -800,6 +804,8 @@ (V2DI "V2SI") (DI "SI") (SI "HI") (HI "QI")]) +(define_mode_attr Vnarrowq [(V8HI "v8qi") (V4SI "v4hi") + (V2DI "v2si")]) ;; Narrowed quad-modes for VQN (Used for XTN2). 
(define_mode_attr VNARROWQ2 [(V8HI "V16QI") (V4SI "V8HI") @@ -1029,8 +1035,10 @@ (V2SF "p") (V4SF "v") (V4HF "v") (V8HF "v")]) -(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")]) -(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")]) +(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi") + (VNx4SI "vnx16qi") (VNx2DI "vnx8hi")]) +(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI") + (VNx4SI "VNx16QI") (VNx2DI "VNx8HI")]) ;; Register suffix for DOTPROD input types from the return type. diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index f7ff95a0edf..325dd3cea9a 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -477,16 +477,17 @@ arc_preferred_simd_mode (scalar_mode mode) } /* Implements target hook - TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES. */ + TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */ -static void -arc_autovectorize_vector_sizes (vector_sizes *sizes) +static unsigned int +arc_autovectorize_vector_modes (vector_modes *modes, bool) { if (TARGET_PLUS_QMACW) { - sizes->quick_push (8); - sizes->quick_push (4); + modes->quick_push (V4HImode); + modes->quick_push (V2HImode); } + return 0; } @@ -596,8 +597,8 @@ static rtx arc_legitimize_address_0 (rtx, rtx, machine_mode mode); #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arc_preferred_simd_mode -#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES -#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES arc_autovectorize_vector_sizes +#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES +#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES arc_autovectorize_vector_modes #undef TARGET_CAN_USE_DOLOOP_P #define TARGET_CAN_USE_DOLOOP_P arc_can_use_doloop_p diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index cdfc0f9e72f..1a4a4b7bc58 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_misalignment (machine_mode mode, static void arm_conditional_register_usage (void); static enum flt_eval_method arm_excess_precision (enum excess_precision_type); static reg_class_t arm_preferred_rename_class (reg_class_t rclass); -static void arm_autovectorize_vector_sizes (vector_sizes *); +static unsigned int arm_autovectorize_vector_modes (vector_modes *, bool); static int arm_default_branch_cost (bool, bool); static int arm_cortex_a5_branch_cost (bool, bool); static int arm_cortex_m_branch_cost (bool, bool); @@ -519,9 +519,9 @@ static const struct attribute_spec arm_attribute_table[] = #define TARGET_ARRAY_MODE_SUPPORTED_P arm_array_mode_supported_p #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode -#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES -#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ - arm_autovectorize_vector_sizes +#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES +#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ + arm_autovectorize_vector_modes #undef TARGET_MACHINE_DEPENDENT_REORG #define TARGET_MACHINE_DEPENDENT_REORG arm_reorg @@ -28446,14 +28446,15 @@ arm_vector_alignment (const_tree type) return align; } -static void -arm_autovectorize_vector_sizes (vector_sizes *sizes) +static unsigned int +arm_autovectorize_vector_modes (vector_modes *modes, bool) { if (!TARGET_NEON_VECTORIZE_DOUBLE) { - sizes->safe_push (16); - sizes->safe_push (8); + modes->safe_push (V16QImode); + modes->safe_push (V8QImode); } + return 0; } static bool diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 99fa45edcd4..eb06ff9e05b 
100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -3800,8 +3800,7 @@ gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, a vector. */ opt_machine_mode -gcn_vectorize_get_mask_mode (poly_uint64 ARG_UNUSED (nunits), - poly_uint64 ARG_UNUSED (length)) +gcn_vectorize_get_mask_mode (machine_mode) { /* GCN uses a DImode bit-mask. */ return DImode; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 1bca5a7eea6..5a0f8a0eb72 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -9647,7 +9647,6 @@ ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); CUMULATIVE_ARGS next_cum; tree fntype; - int max; gcc_assert (!no_rtl); @@ -9663,10 +9662,6 @@ ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, if (stdarg_p (fntype)) ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, true); - - max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; - if (max > X86_64_REGPARM_MAX) - max = X86_64_REGPARM_MAX; } @@ -11806,7 +11801,6 @@ choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, { base_reg = hard_frame_pointer_rtx; base_offset = toffset; - len = tlen; } } } @@ -39699,12 +39693,10 @@ ix86_preferred_reload_class (rtx x, reg_class_t regclass) static reg_class_t ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) { - machine_mode mode = GET_MODE (x); - /* Restrict the output reload class to the register bank that we are doing math on. If we would like not to return a subset of CLASS, reject this alternative: if reload cannot do this, it will still use its choice. */ - mode = GET_MODE (x); + machine_mode mode = GET_MODE (x); if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; @@ -45666,14 +45658,13 @@ ix86_expand_rounddf_32 (rtx operand0, rtx operand1) 0, OPTAB_DIRECT); /* Compensate. */ - tmp = gen_reg_rtx (mode); /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); /* res = copysign (xa2, operand1) */ @@ -50238,27 +50229,42 @@ ix86_split_reduction (machine_mode mode) vectors. If AVX512F is enabled then try vectorizing with 512bit, 256bit and 128bit vectors. 
*/ -static void -ix86_autovectorize_vector_sizes (vector_sizes *sizes) +static unsigned int +ix86_autovectorize_vector_modes (vector_modes *modes, bool all) { if (TARGET_AVX512F && !TARGET_PREFER_AVX256) { - sizes->safe_push (64); - sizes->safe_push (32); - sizes->safe_push (16); + modes->safe_push (V64QImode); + modes->safe_push (V32QImode); + modes->safe_push (V16QImode); + } + else if (TARGET_AVX512F && all) + { + modes->safe_push (V32QImode); + modes->safe_push (V16QImode); + modes->safe_push (V64QImode); } else if (TARGET_AVX && !TARGET_PREFER_AVX128) { - sizes->safe_push (32); - sizes->safe_push (16); + modes->safe_push (V32QImode); + modes->safe_push (V16QImode); + } + else if (TARGET_AVX && all) + { + modes->safe_push (V16QImode); + modes->safe_push (V32QImode); } + + return 0; } /* Implemenation of targetm.vectorize.get_mask_mode. */ static opt_machine_mode -ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size) +ix86_get_mask_mode (machine_mode data_mode) { + unsigned vector_size = GET_MODE_SIZE (data_mode); + unsigned nunits = GET_MODE_NUNITS (data_mode); unsigned elem_size = vector_size / nunits; /* Scalar mask case. */ @@ -51849,9 +51855,9 @@ ix86_run_selftests (void) #undef TARGET_VECTORIZE_SPLIT_REDUCTION #define TARGET_VECTORIZE_SPLIT_REDUCTION \ ix86_split_reduction -#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES -#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ - ix86_autovectorize_vector_sizes +#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES +#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ + ix86_autovectorize_vector_modes #undef TARGET_VECTORIZE_GET_MASK_MODE #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode #undef TARGET_VECTORIZE_INIT_COST diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 18cc39ae521..8c961f12a42 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16441,10 +16441,9 @@ (unspec:VF_128_256 [(match_operand:VF_128_256 1 "register_operand" "0,0,x") (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") - (subreg:VF_128_256 - (lt: - (match_operand: 3 "register_operand" "Yz,Yz,x") - (match_operand: 4 "const0_operand" "C,C,C")) 0)] + (lt:VF_128_256 + (match_operand: 3 "register_operand" "Yz,Yz,x") + (match_operand: 4 "const0_operand" "C,C,C"))] UNSPEC_BLENDV))] "TARGET_SSE4_1" "#" diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index d758fbf1be6..1008947209e 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -13457,13 +13457,14 @@ mips_preferred_simd_mode (scalar_mode mode) return word_mode; } -/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES. */ +/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES. */ -static void -mips_autovectorize_vector_sizes (vector_sizes *sizes) +static unsigned int +mips_autovectorize_vector_modes (vector_modes *modes, bool) { if (ISA_HAS_MSA) - sizes->safe_push (16); + modes->safe_push (V16QImode); + return 0; } /* Implement TARGET_INIT_LIBFUNCS. 
*/ @@ -22676,9 +22677,9 @@ mips_starting_frame_offset (void) #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE mips_preferred_simd_mode -#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES -#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ - mips_autovectorize_vector_sizes +#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES +#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \ + mips_autovectorize_vector_modes #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS mips_init_builtins diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 87d60078bb0..8f046de424c 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -15457,7 +15457,7 @@ static tree fold_build_vec_cmp (tree_code code, tree type, tree arg0, tree arg1) { - tree cmp_type = build_same_sized_truth_vector_type (type); + tree cmp_type = truth_type_for (type); tree zero_vec = build_zero_cst (type); tree minus_one_vec = build_minus_one_cst (type); tree cmp = fold_build2 (code, cmp_type, arg0, arg1); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index db3f94978ec..c35666dec83 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -6588,7 +6588,7 @@ s390_expand_vec_compare_cc (rtx target, enum rtx_code code, case LE: cc_producer_mode = CCVFHEmode; code = GE; swap_p = true; break; default: gcc_unreachable (); } - scratch_mode = mode_for_int_vector (GET_MODE (cmp1)).require (); + scratch_mode = related_int_vector_mode (GET_MODE (cmp1)).require (); if (inv_p) all_p = !all_p; @@ -6694,7 +6694,7 @@ s390_expand_vcond (rtx target, rtx then, rtx els, /* We always use an integral type vector to hold the comparison result. */ - result_mode = mode_for_int_vector (cmp_mode).require (); + result_mode = related_int_vector_mode (cmp_mode).require (); result_target = gen_reg_rtx (result_mode); /* We allow vector immediates as comparison operands that diff --git a/gcc/cp/call.c b/gcc/cp/call.c index f365a5a7f7b..23a54f3c332 100644 --- a/gcc/cp/call.c +++ b/gcc/cp/call.c @@ -5161,7 +5161,7 @@ build_conditional_expr_1 (const op_location_t &loc, if (!COMPARISON_CLASS_P (arg1)) { - tree cmp_type = build_same_sized_truth_vector_type (arg1_type); + tree cmp_type = truth_type_for (arg1_type); arg1 = build2 (NE_EXPR, cmp_type, arg1, build_zero_cst (arg1_type)); } return build3_loc (loc, VEC_COND_EXPR, arg2_type, arg1, arg2, arg3); diff --git a/gcc/cp/class.c b/gcc/cp/class.c index 6b57184e081..5b0a60d61cc 100644 --- a/gcc/cp/class.c +++ b/gcc/cp/class.c @@ -4760,8 +4760,6 @@ adjust_clone_args (tree decl) tree orig_decl_parms = TYPE_ARG_TYPES (TREE_TYPE (decl)); tree decl_parms, clone_parms; - clone_parms = orig_clone_parms; - /* Skip the 'this' parameter. 
*/ orig_clone_parms = TREE_CHAIN (orig_clone_parms); orig_decl_parms = TREE_CHAIN (orig_decl_parms); @@ -8581,7 +8579,6 @@ dump_class_hierarchy_r (FILE *stream, tree base_binfo; int i; - indented = maybe_indent_hierarchy (stream, indent, 0); fprintf (stream, "%s (0x" HOST_WIDE_INT_PRINT_HEX ") ", type_as_string (BINFO_TYPE (binfo), TFF_PLAIN_IDENTIFIER), (HOST_WIDE_INT) (uintptr_t) binfo); @@ -8602,7 +8599,6 @@ dump_class_hierarchy_r (FILE *stream, fprintf (stream, " virtual"); fprintf (stream, "\n"); - indented = 0; if (BINFO_PRIMARY_P (binfo)) { indented = maybe_indent_hierarchy (stream, indent + 3, indented); diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c index 39d55589ef3..5c82c2272c2 100644 --- a/gcc/cp/decl.c +++ b/gcc/cp/decl.c @@ -6387,7 +6387,7 @@ build_aggr_init_full_exprs (tree decl, tree init, int flags) static tree check_initializer (tree decl, tree init, int flags, vec **cleanups) { - tree type = TREE_TYPE (decl); + tree type; tree init_code = NULL; tree core_type; diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index e1c02d7b718..60fe58e0313 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -10485,7 +10485,7 @@ cp_parser_lambda_expression (cp_parser* parser) if (ok) maybe_add_lambda_conv_op (type); - type = finish_struct (type, /*attributes=*/NULL_TREE); + finish_struct (type, /*attributes=*/NULL_TREE); in_discarded_stmt = discarded; diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c index 4787747b6ff..ff7921533cb 100644 --- a/gcc/cp/pt.c +++ b/gcc/cp/pt.c @@ -7459,8 +7459,7 @@ unify_bound_ttp_args (tree tparms, tree targs, tree parm, tree& arg, { /* In keeping with P0522R0, adjust P's template arguments to apply to A's template; then flatten it again. */ - tree nparmvec = parmvec; - nparmvec = coerce_ttp_args_for_tta (arg, parmvec, tf_none); + tree nparmvec = coerce_ttp_args_for_tta (arg, parmvec, tf_none); nparmvec = expand_template_argument_pack (nparmvec); if (unify (tparms, targs, nparmvec, argvec, @@ -7887,7 +7886,6 @@ convert_template_argument (tree parm, invalid, but static members are OK. In any case, grab the underlying fields/functions and issue an error later if required. 
*/ - orig_arg = TREE_VALUE (arg); TREE_TYPE (arg) = unknown_type_node; } diff --git a/gcc/cp/rtti.c b/gcc/cp/rtti.c index 3ca2b5e7b88..9aea6b939ec 100644 --- a/gcc/cp/rtti.c +++ b/gcc/cp/rtti.c @@ -209,8 +209,8 @@ build_headof (tree exp) offset = build_vtbl_ref (cp_build_fold_indirect_ref (exp), index); - type = cp_build_qualified_type (ptr_type_node, - cp_type_quals (TREE_TYPE (exp))); + cp_build_qualified_type (ptr_type_node, + cp_type_quals (TREE_TYPE (exp))); return fold_build_pointer_plus (exp, offset); } diff --git a/gcc/cp/typeck.c b/gcc/cp/typeck.c index 2169f8c4efd..c42fd731cd2 100644 --- a/gcc/cp/typeck.c +++ b/gcc/cp/typeck.c @@ -4293,7 +4293,7 @@ build_vec_cmp (tree_code code, tree type, { tree zero_vec = build_zero_cst (type); tree minus_one_vec = build_minus_one_cst (type); - tree cmp_type = build_same_sized_truth_vector_type(type); + tree cmp_type = truth_type_for (type); tree cmp = build2 (code, cmp_type, arg0, arg1); return build3 (VEC_COND_EXPR, type, cmp, minus_one_vec, zero_vec); } @@ -9189,8 +9189,6 @@ convert_for_initialization (tree exp, tree type, tree rhs, int flags, if (exp == error_mark_node) return error_mark_node; - rhstype = non_reference (rhstype); - type = complete_type (type); if (DIRECT_INIT_EXPR_P (type, rhs)) diff --git a/gcc/cselib.c b/gcc/cselib.c index 84c17c23f6d..108b2588cf9 100644 --- a/gcc/cselib.c +++ b/gcc/cselib.c @@ -2518,13 +2518,12 @@ cselib_record_sets (rtx_insn *insn) int n_sets = 0; int i; struct cselib_set sets[MAX_SETS]; - rtx body = PATTERN (insn); rtx cond = 0; int n_sets_before_autoinc; int n_strict_low_parts = 0; struct cselib_record_autoinc_data data; - body = PATTERN (insn); + rtx body = PATTERN (insn); if (GET_CODE (body) == COND_EXEC) { cond = COND_EXEC_TEST (body); diff --git a/gcc/d/d-codegen.cc b/gcc/d/d-codegen.cc index 2abff92fc88..6f5499b08ee 100644 --- a/gcc/d/d-codegen.cc +++ b/gcc/d/d-codegen.cc @@ -1397,7 +1397,7 @@ build_boolop (tree_code code, tree arg0, tree arg1) /* Build a vector comparison. VEC_COND_EXPR ; */ tree type = TREE_TYPE (arg0); - tree cmptype = build_same_sized_truth_vector_type (type); + tree cmptype = truth_type_for (type); tree cmp = fold_build2_loc (input_location, code, cmptype, arg0, arg1); return fold_build3_loc (input_location, VEC_COND_EXPR, type, cmp, diff --git a/gcc/df-scan.c b/gcc/df-scan.c index 08d7af33371..84c2e54c855 100644 --- a/gcc/df-scan.c +++ b/gcc/df-scan.c @@ -229,7 +229,6 @@ void df_scan_alloc (bitmap all_blocks ATTRIBUTE_UNUSED) { struct df_scan_problem_data *problem_data; - unsigned int insn_num = get_max_uid () + 1; basic_block bb; /* Given the number of pools, this is really faster than tearing @@ -257,7 +256,6 @@ df_scan_alloc (bitmap all_blocks ATTRIBUTE_UNUSED) bitmap_obstack_initialize (&problem_data->reg_bitmaps); bitmap_obstack_initialize (&problem_data->insn_bitmaps); - insn_num += insn_num / 4; df_grow_reg_info (); df_grow_insn_info (); diff --git a/gcc/doc/poly-int.texi b/gcc/doc/poly-int.texi index 1023e823cb3..d60bb02aabf 100644 --- a/gcc/doc/poly-int.texi +++ b/gcc/doc/poly-int.texi @@ -803,6 +803,18 @@ the assertion is known to hold. @item constant_lower_bound (@var{a}) Assert that @var{a} is nonnegative and return the smallest value it can have. +@item constant_lower_bound_with_limit (@var{a}, @var{b}) +Return the least value @var{a} can have, given that the context in +which @var{a} appears guarantees that the answer is no less than @var{b}. 
+In other words, the caller is asserting that @var{a} is greater than or +equal to @var{b} even if @samp{known_ge (@var{a}, @var{b})} doesn't hold. + +@item constant_upper_bound_with_limit (@var{a}, @var{b}) +Return the greatest value @var{a} can have, given that the context in +which @var{a} appears guarantees that the answer is no greater than @var{b}. +In other words, the caller is asserting that @var{a} is less than or equal +to @var{b} even if @samp{known_le (@var{a}, @var{b})} doesn't hold. + @item lower_bound (@var{a}, @var{b}) Return a value that is always less than or equal to both @var{a} and @var{b}. It will be the greatest such value for some indeterminate values diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 8c8978bb13a..73db70867b4 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6016,27 +6016,71 @@ against lower halves of vectors recursively until the specified mode is reached. The default is @var{mode} which means no splitting. @end deftypefn -@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}) -If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not -the only one that is worth considering, this hook should add all suitable -vector sizes to @var{sizes}, in order of decreasing preference. The first -one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}. +@deftypefn {Target Hook} {unsigned int} TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES (vector_modes *@var{modes}, bool @var{all}) +If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} +is not the only approach worth considering, this hook should add one mode to +@var{modes} for each useful alternative approach. These modes are then +passed to @code{TARGET_VECTORIZE_RELATED_MODE} to obtain the vector mode +for a given element mode. + +The modes returned in @var{modes} should use the smallest element mode +possible for the vectorization approach that they represent, preferring +integer modes over floating-poing modes in the event of a tie. The first +mode should be the @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} for its +element mode. + +If @var{all} is true, add suitable vector modes even when they are generally +not expected to be worthwhile. + +The hook returns a bitmask of flags that control how the modes in +@var{modes} are used. The flags are: +@table @code +@item VECT_COMPARE_COSTS +Tells the loop vectorizer to try all the provided modes and pick the one +with the lowest cost. By default the vectorizer will choose the first +mode that works. +@end table The hook does not need to do anything if the vector returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant -for autovectorization. The default implementation does nothing. -@end deftypefn - -@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_GET_MASK_MODE (poly_uint64 @var{nunits}, poly_uint64 @var{length}) -A vector mask is a value that holds one boolean result for every element -in a vector. This hook returns the machine mode that should be used to -represent such a mask when the vector in question is @var{length} bytes -long and contains @var{nunits} elements. The hook returns an empty -@code{opt_machine_mode} if no such mode exists. - -The default implementation returns the mode of an integer vector that -is @var{length} bytes long and that contains @var{nunits} elements, -if such a mode exists. +for autovectorization. The default implementation adds no modes and +returns 0. 
+@end deftypefn + +@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_RELATED_MODE (machine_mode @var{vector_mode}, scalar_mode @var{element_mode}, poly_uint64 @var{nunits}) +If a piece of code is using vector mode @var{vector_mode} and also wants +to operate on elements of mode @var{element_mode}, return the vector mode +it should use for those elements. If @var{nunits} is nonzero, ensure that +the mode has exactly @var{nunits} elements, otherwise pick whichever vector +size pairs the most naturally with @var{vector_mode}. Return an empty +@code{opt_machine_mode} if there is no supported vector mode with the +required properties. + +There is no prescribed way of handling the case in which @var{nunits} +is zero. One common choice is to pick a vector mode with the same size +as @var{vector_mode}; this is the natural choice if the target has a +fixed vector size. Another option is to choose a vector mode with the +same number of elements as @var{vector_mode}; this is the natural choice +if the target has a fixed number of elements. Alternatively, the hook +might choose a middle ground, such as trying to keep the number of +elements as similar as possible while applying maximum and minimum +vector sizes. + +The default implementation uses @code{mode_for_vector} to find the +requested mode, returning a mode with the same size as @var{vector_mode} +when @var{nunits} is zero. This is the correct behavior for most targets. +@end deftypefn + +@deftypefn {Target Hook} opt_machine_mode TARGET_VECTORIZE_GET_MASK_MODE (machine_mode @var{mode}) +Return the mode to use for a vector mask that holds one boolean +result for each element of vector mode @var{mode}. The returned mask mode +can be a vector of integers (class @code{MODE_VECTOR_INT}), a vector of +booleans (class @code{MODE_VECTOR_BOOL}) or a scalar integer (class +@code{MODE_INT}). Return an empty @code{opt_machine_mode} if no such +mask mode exists. + +The default implementation returns a @code{MODE_VECTOR_INT} with the +same size and number of elements as @var{mode}, if such a mode exists. @end deftypefn @deftypefn {Target Hook} bool TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE (unsigned @var{ifn}) diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index fe1194ef91a..bc362dca0f5 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4172,7 +4172,9 @@ address; but often a machine-dependent strategy can generate better code. 
@hook TARGET_VECTORIZE_SPLIT_REDUCTION -@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES +@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES + +@hook TARGET_VECTORIZE_RELATED_MODE @hook TARGET_VECTORIZE_GET_MASK_MODE diff --git a/gcc/dojump.c b/gcc/dojump.c index 8626689463e..bac37a357a9 100644 --- a/gcc/dojump.c +++ b/gcc/dojump.c @@ -668,8 +668,6 @@ do_jump_by_parts_greater_rtx (scalar_int_mode mode, int unsignedp, rtx op0, code = LE; if_true_label = if_false_label; if_false_label = drop_through_label; - drop_through_if_true = false; - drop_through_if_false = true; prob = prob.invert (); } diff --git a/gcc/early-remat.c b/gcc/early-remat.c index 122891c1edb..0396f16babf 100644 --- a/gcc/early-remat.c +++ b/gcc/early-remat.c @@ -1123,7 +1123,6 @@ early_remat::record_equiv_candidates (unsigned int cand1_index, ec->representative = cand1_index; cand1->equiv_class = ec; } - cand1 = &m_candidates[ec->representative]; cand2->equiv_class = ec; bitmap_set_bit (ec->members, cand2_index); if (cand2_index > ec->representative) diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index 15dffa58a2e..78104603c76 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -3993,7 +3993,7 @@ try_split (rtx pat, rtx_insn *trial, int last) before = PREV_INSN (trial); after = NEXT_INSN (trial); - tem = emit_insn_after_setloc (seq, trial, INSN_LOCATION (trial)); + emit_insn_after_setloc (seq, trial, INSN_LOCATION (trial)); delete_insn (trial); diff --git a/gcc/expmed.c b/gcc/expmed.c index c5f5499c013..34cdfbf151a 100644 --- a/gcc/expmed.c +++ b/gcc/expmed.c @@ -1662,12 +1662,10 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum, poly_uint64 nunits; if (!multiple_p (GET_MODE_BITSIZE (GET_MODE (op0)), GET_MODE_UNIT_BITSIZE (tmode), &nunits) - || !mode_for_vector (inner_mode, nunits).exists (&new_mode) - || !VECTOR_MODE_P (new_mode) + || !related_vector_mode (tmode, inner_mode, + nunits).exists (&new_mode) || maybe_ne (GET_MODE_SIZE (new_mode), - GET_MODE_SIZE (GET_MODE (op0))) - || GET_MODE_INNER (new_mode) != GET_MODE_INNER (tmode) - || !targetm.vector_mode_supported_p (new_mode)) + GET_MODE_SIZE (GET_MODE (op0)))) new_mode = VOIDmode; } poly_uint64 pos; diff --git a/gcc/expr.c b/gcc/expr.c index fa15b7eceae..5e3700fe15f 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -249,6 +249,31 @@ convert_move (rtx to, rtx from, int unsignedp) if (VECTOR_MODE_P (to_mode) || VECTOR_MODE_P (from_mode)) { + if (GET_MODE_UNIT_PRECISION (to_mode) + > GET_MODE_UNIT_PRECISION (from_mode)) + { + optab op = unsignedp ? zext_optab : sext_optab; + insn_code icode = convert_optab_handler (op, to_mode, from_mode); + if (icode != CODE_FOR_nothing) + { + emit_unop_insn (icode, to, from, + unsignedp ? 
ZERO_EXTEND : SIGN_EXTEND); + return; + } + } + + if (GET_MODE_UNIT_PRECISION (to_mode) + < GET_MODE_UNIT_PRECISION (from_mode)) + { + insn_code icode = convert_optab_handler (trunc_optab, + to_mode, from_mode); + if (icode != CODE_FOR_nothing) + { + emit_unop_insn (icode, to, from, TRUNCATE); + return; + } + } + gcc_assert (known_eq (GET_MODE_BITSIZE (from_mode), GET_MODE_BITSIZE (to_mode))); diff --git a/gcc/fold-const.h b/gcc/fold-const.h index 049fee91876..e2e66246315 100644 --- a/gcc/fold-const.h +++ b/gcc/fold-const.h @@ -83,7 +83,7 @@ extern bool fold_deferring_overflow_warnings_p (void); extern void fold_overflow_warning (const char*, enum warn_strict_overflow_code); extern enum tree_code fold_div_compare (enum tree_code, tree, tree, tree *, tree *, bool *); -extern int operand_equal_p (const_tree, const_tree, unsigned int); +extern int operand_equal_p (const_tree, const_tree, unsigned int flags = 0); extern int multiple_of_p (tree, const_tree, const_tree); #define omit_one_operand(T1,T2,T3)\ omit_one_operand_loc (UNKNOWN_LOCATION, T1, T2, T3) diff --git a/gcc/fwprop.c b/gcc/fwprop.c index cf2c9de2d35..f2966fadae8 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -448,6 +448,18 @@ enum { PR_OPTIMIZE_FOR_SPEED = 4 }; +/* Check that X has a single def. */ + +static bool +reg_single_def_p (rtx x) +{ + if (!REG_P (x)) + return false; + + int regno = REGNO (x); + return (DF_REG_DEF_COUNT (regno) == 1 + && !bitmap_bit_p (DF_LR_OUT (ENTRY_BLOCK_PTR_FOR_FN (cfun)), regno)); +} /* Replace all occurrences of OLD in *PX with NEW and try to simplify the resulting expression. Replace *PX with a new RTL expression if an @@ -547,6 +559,54 @@ propagate_rtx_1 (rtx *px, rtx old_rtx, rtx new_rtx, int flags) tem = simplify_gen_subreg (mode, op0, GET_MODE (SUBREG_REG (x)), SUBREG_BYTE (x)); } + + else + { + rtvec vec; + rtvec newvec; + const char *fmt = GET_RTX_FORMAT (code); + rtx op; + + for (int i = 0; fmt[i]; i++) + switch (fmt[i]) + { + case 'E': + vec = XVEC (x, i); + newvec = vec; + for (int j = 0; j < GET_NUM_ELEM (vec); j++) + { + op = RTVEC_ELT (vec, j); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != RTVEC_ELT (vec, j)) + { + if (newvec == vec) + { + newvec = shallow_copy_rtvec (vec); + if (!tem) + tem = shallow_copy_rtx (x); + XVEC (tem, i) = newvec; + } + RTVEC_ELT (newvec, j) = op; + } + } + break; + + case 'e': + if (XEXP (x, i)) + { + op = XEXP (x, i); + valid_ops &= propagate_rtx_1 (&op, old_rtx, new_rtx, flags); + if (op != XEXP (x, i)) + { + if (!tem) + tem = shallow_copy_rtx (x); + XEXP (tem, i) = op; + } + } + break; + } + } + break; case RTX_OBJ: @@ -1370,10 +1430,11 @@ forward_propagate_and_simplify (df_ref use, rtx_insn *def_insn, rtx def_set) /* Given a use USE of an insn, if it has a single reaching definition, try to forward propagate it into that insn. - Return true if cfg cleanup will be needed. */ + Return true if cfg cleanup will be needed. + REG_PROP_ONLY is true if we should only propagate register copies. */ static bool -forward_propagate_into (df_ref use) +forward_propagate_into (df_ref use, bool reg_prop_only = false) { df_ref def; rtx_insn *def_insn, *use_insn; @@ -1394,10 +1455,6 @@ forward_propagate_into (df_ref use) if (DF_REF_IS_ARTIFICIAL (def)) return false; - /* Do not propagate loop invariant definitions inside the loop. */ - if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father) - return false; - /* Check if the use is still present in the insn! 
*/ use_insn = DF_REF_INSN (use); if (DF_REF_FLAGS (use) & DF_REF_IN_NOTE) @@ -1415,6 +1472,19 @@ forward_propagate_into (df_ref use) if (!def_set) return false; + if (reg_prop_only + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + + /* Allow propagations into a loop only for reg-to-reg copies, since + replacing one register by another shouldn't increase the cost. */ + + if (DF_REF_BB (def)->loop_father != DF_REF_BB (use)->loop_father + && (!reg_single_def_p (SET_SRC (def_set)) + || !reg_single_def_p (SET_DEST (def_set)))) + return false; + /* Only try one kind of propagation. If two are possible, we'll do it on the following iterations. */ if (forward_propagate_and_simplify (use, def_insn, def_set) @@ -1483,7 +1553,7 @@ gate_fwprop (void) } static unsigned int -fwprop (void) +fwprop (bool fwprop_addr_p) { unsigned i; @@ -1502,11 +1572,16 @@ fwprop (void) df_ref use = DF_USES_GET (i); if (use) - if (DF_REF_TYPE (use) == DF_REF_REG_USE - || DF_REF_BB (use)->loop_father == NULL - /* The outer most loop is not really a loop. */ - || loop_outer (DF_REF_BB (use)->loop_father) == NULL) - forward_propagate_into (use); + { + if (DF_REF_TYPE (use) == DF_REF_REG_USE + || DF_REF_BB (use)->loop_father == NULL + /* The outer most loop is not really a loop. */ + || loop_outer (DF_REF_BB (use)->loop_father) == NULL) + forward_propagate_into (use, fwprop_addr_p); + + else if (fwprop_addr_p) + forward_propagate_into (use, false); + } } fwprop_done (); @@ -1537,7 +1612,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop (); } + virtual unsigned int execute (function *) { return fwprop (false); } }; // class pass_rtl_fwprop @@ -1549,33 +1624,6 @@ make_pass_rtl_fwprop (gcc::context *ctxt) return new pass_rtl_fwprop (ctxt); } -static unsigned int -fwprop_addr (void) -{ - unsigned i; - - fwprop_init (); - - /* Go through all the uses. df_uses_create will create new ones at the - end, and we'll go through them as well. */ - for (i = 0; i < DF_USES_TABLE_SIZE (); i++) - { - if (!propagations_left) - break; - - df_ref use = DF_USES_GET (i); - if (use) - if (DF_REF_TYPE (use) != DF_REF_REG_USE - && DF_REF_BB (use)->loop_father != NULL - /* The outer most loop is not really a loop. */ - && loop_outer (DF_REF_BB (use)->loop_father) != NULL) - forward_propagate_into (use); - } - - fwprop_done (); - return 0; -} - namespace { const pass_data pass_data_rtl_fwprop_addr = @@ -1600,7 +1648,7 @@ public: /* opt_pass methods: */ virtual bool gate (function *) { return gate_fwprop (); } - virtual unsigned int execute (function *) { return fwprop_addr (); } + virtual unsigned int execute (function *) { return fwprop (true); } }; // class pass_rtl_fwprop_addr diff --git a/gcc/gimple.c b/gcc/gimple.c index 8fae60fb848..bf362dbe545 100644 --- a/gcc/gimple.c +++ b/gcc/gimple.c @@ -1771,6 +1771,8 @@ gimple_get_lhs (const gimple *stmt) return gimple_assign_lhs (stmt); else if (code == GIMPLE_CALL) return gimple_call_lhs (stmt); + else if (code == GIMPLE_PHI) + return gimple_phi_result (stmt); else return NULL_TREE; } diff --git a/gcc/graphite-scop-detection.c b/gcc/graphite-scop-detection.c index 4534d43721f..489d0b93b42 100644 --- a/gcc/graphite-scop-detection.c +++ b/gcc/graphite-scop-detection.c @@ -1105,14 +1105,12 @@ assign_parameter_index_in_region (tree name, sese_info_p region) gcc_assert (TREE_CODE (name) == SSA_NAME && INTEGRAL_TYPE_P (TREE_TYPE (name)) && ! 
defined_in_sese_p (name, region->region)); - int i; tree p; FOR_EACH_VEC_ELT (region->params, i, p) if (p == name) return; - i = region->params.length (); region->params.safe_push (name); } diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 95788dfee7d..21ecd566766 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -100,7 +100,7 @@ init_internal_fns () /* Create static initializers for the information returned by direct_internal_fn. */ #define not_direct { -2, -2, false } -#define mask_load_direct { -1, 2, false } +#define mask_load_direct { -1, 2, true } #define load_lanes_direct { -1, -1, false } #define mask_load_lanes_direct { -1, -1, false } #define gather_load_direct { -1, -1, false } diff --git a/gcc/ira-color.c b/gcc/ira-color.c index aa91b56c81f..8a90ae1b4e6 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -1108,7 +1108,6 @@ setup_profitable_hard_regs (void) || empty_profitable_hard_regs (a)) continue; data = ALLOCNO_COLOR_DATA (a); - mode = ALLOCNO_MODE (a); if ((costs = ALLOCNO_UPDATED_HARD_REG_COSTS (a)) != NULL || (costs = ALLOCNO_HARD_REG_COSTS (a)) != NULL) { diff --git a/gcc/ira.c b/gcc/ira.c index b330f2a287b..4262e5cf3b7 100644 --- a/gcc/ira.c +++ b/gcc/ira.c @@ -4414,10 +4414,9 @@ rtx_moveable_p (rtx *loc, enum op_type type) { const char *fmt; rtx x = *loc; - enum rtx_code code = GET_CODE (x); int i, j; - code = GET_CODE (x); + enum rtx_code code = GET_CODE (x); switch (code) { case CONST: diff --git a/gcc/lra-eliminations.c b/gcc/lra-eliminations.c index ee9fd51f129..7a345a52ae1 100644 --- a/gcc/lra-eliminations.c +++ b/gcc/lra-eliminations.c @@ -1146,7 +1146,6 @@ eliminate_regs_in_insn (rtx_insn *insn, bool replace_p, bool first_p, single_set without having put new body into the insn and the re-recognition won't hurt in this rare case. */ id = lra_update_insn_recog_data (insn); - static_id = id->insn_static_data; } /* Spill pseudos which are assigned to hard registers in SET. Add diff --git a/gcc/lra.c b/gcc/lra.c index 1d2578f8c12..10b85340fc5 100644 --- a/gcc/lra.c +++ b/gcc/lra.c @@ -1029,12 +1029,8 @@ lra_set_insn_recog_data (rtx_insn *insn) data->operand_loc, constraints, operand_mode, NULL); if (nop > 0) - { - const char *p = recog_data.constraints[0]; - - for (p = constraints[0]; *p; p++) - nalt += *p == ','; - } + for (const char *p =constraints[0]; *p; p++) + nalt += *p == ','; data->insn_static_data = insn_static_data = get_static_insn_data (-1, nop, 0, nalt); for (i = 0; i < nop; i++) diff --git a/gcc/machmode.h b/gcc/machmode.h index 3a7cee88962..d564f9c6458 100644 --- a/gcc/machmode.h +++ b/gcc/machmode.h @@ -257,6 +257,9 @@ public: bool exists () const; template<typename U> bool exists (U *) const; + bool operator== (const T &m) const { return m_mode == m; } + bool operator!= (const T &m) const { return m_mode != m; } + private: machine_mode m_mode; }; @@ -841,20 +844,9 @@ smallest_int_mode_for_size (poly_uint64 size) extern opt_scalar_int_mode int_mode_for_mode (machine_mode); extern opt_machine_mode bitwise_mode_for_mode (machine_mode); extern opt_machine_mode mode_for_vector (scalar_mode, poly_uint64); -extern opt_machine_mode mode_for_int_vector (unsigned int, poly_uint64); - -/* Return the integer vector equivalent of MODE, if one exists. In other words, return the mode for an integer vector that has the same number of bits as MODE and the same number of elements as MODE, with the latter being 1 if MODE is scalar. The returned mode can be either an integer mode or a vector mode. 
*/ - -inline opt_machine_mode -mode_for_int_vector (machine_mode mode) -{ - return mode_for_int_vector (GET_MODE_UNIT_BITSIZE (mode), - GET_MODE_NUNITS (mode)); -} +extern opt_machine_mode related_vector_mode (machine_mode, scalar_mode, + poly_uint64 = 0); +extern opt_machine_mode related_int_vector_mode (machine_mode); /* A class for iterating through possible bitfield modes. */ class bit_field_mode_iterator diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 74159734fc8..0d7f104a2f2 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -4974,6 +4974,13 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) && loop->safelen > 1) { loop->force_vectorize = true; + if (simdlen && tree_fits_uhwi_p (OMP_CLAUSE_SIMDLEN_EXPR (simdlen))) + { + unsigned HOST_WIDE_INT v + = tree_to_uhwi (OMP_CLAUSE_SIMDLEN_EXPR (simdlen)); + if (v < INT_MAX && v <= (unsigned HOST_WIDE_INT) loop->safelen) + loop->simdlen = v; + } cfun->has_force_vectorize_loops = true; } else if (dont_vectorize) diff --git a/gcc/omp-general.c b/gcc/omp-general.c index 356772ff458..4fb53af7587 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -468,13 +468,16 @@ omp_max_vf (void) && global_options_set.x_flag_tree_loop_vectorize)) return 1; - auto_vector_sizes sizes; - targetm.vectorize.autovectorize_vector_sizes (&sizes); - if (!sizes.is_empty ()) + auto_vector_modes modes; + targetm.vectorize.autovectorize_vector_modes (&modes, true); + if (!modes.is_empty ()) { poly_uint64 vf = 0; - for (unsigned int i = 0; i < sizes.length (); ++i) - vf = ordered_max (vf, sizes[i]); + for (unsigned int i = 0; i < modes.length (); ++i) + /* The returned modes use the smallest element size (and thus + the largest nunits) for the vectorization approach that they + represent. */ + vf = ordered_max (vf, GET_MODE_NUNITS (modes[i])); return vf; } diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 813cefd69b9..7866639f76c 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -3650,11 +3650,8 @@ omp_clause_aligned_alignment (tree clause) /* Otherwise return implementation defined alignment. */ unsigned int al = 1; opt_scalar_mode mode_iter; - auto_vector_sizes sizes; - targetm.vectorize.autovectorize_vector_sizes (&sizes); - poly_uint64 vs = 0; - for (unsigned int i = 0; i < sizes.length (); ++i) - vs = ordered_max (vs, sizes[i]); + auto_vector_modes modes; + targetm.vectorize.autovectorize_vector_modes (&modes, true); static enum mode_class classes[] = { MODE_INT, MODE_VECTOR_INT, MODE_FLOAT, MODE_VECTOR_FLOAT }; for (int i = 0; i < 4; i += 2) @@ -3665,19 +3662,18 @@ omp_clause_aligned_alignment (tree clause) machine_mode vmode = targetm.vectorize.preferred_simd_mode (mode); if (GET_MODE_CLASS (vmode) != classes[i + 1]) continue; - while (maybe_ne (vs, 0U) - && known_lt (GET_MODE_SIZE (vmode), vs) - && GET_MODE_2XWIDER_MODE (vmode).exists ()) - vmode = GET_MODE_2XWIDER_MODE (vmode).require (); + machine_mode alt_vmode; + for (unsigned int j = 0; j < modes.length (); ++j) + if (related_vector_mode (modes[j], mode).exists (&alt_vmode) + && known_ge (GET_MODE_SIZE (alt_vmode), GET_MODE_SIZE (vmode))) + vmode = alt_vmode; tree type = lang_hooks.types.type_for_mode (mode, 1); if (type == NULL_TREE || TYPE_MODE (type) != mode) continue; - poly_uint64 nelts = exact_div (GET_MODE_SIZE (vmode), - GET_MODE_SIZE (mode)); - type = build_vector_type (type, nelts); - if (TYPE_MODE (type) != vmode) - continue; + type = build_vector_type_for_mode (type, vmode); + /* The functions above are not allowed to return invalid modes. 
*/ + gcc_assert (TYPE_MODE (type) == vmode); if (TYPE_ALIGN_UNIT (type) > al) al = TYPE_ALIGN_UNIT (type); } diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c index 71c73fb43cc..61de7dc283b 100644 --- a/gcc/optabs-query.c +++ b/gcc/optabs-query.c @@ -354,11 +354,8 @@ can_conditionally_move_p (machine_mode mode) opt_machine_mode qimode_for_vec_perm (machine_mode mode) { - machine_mode qimode; - if (GET_MODE_INNER (mode) != QImode - && mode_for_vector (QImode, GET_MODE_SIZE (mode)).exists (&qimode) - && VECTOR_MODE_P (qimode)) - return qimode; + if (GET_MODE_INNER (mode) != QImode) + return related_vector_mode (mode, QImode, GET_MODE_SIZE (mode)); return opt_machine_mode (); } @@ -587,22 +584,21 @@ can_vec_mask_load_store_p (machine_mode mode, if (!VECTOR_MODE_P (vmode)) return false; - if ((targetm.vectorize.get_mask_mode - (GET_MODE_NUNITS (vmode), GET_MODE_SIZE (vmode)).exists (&mask_mode)) + if (targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode) && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing) return true; - auto_vector_sizes vector_sizes; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); - for (unsigned int i = 0; i < vector_sizes.length (); ++i) + auto_vector_modes vector_modes; + targetm.vectorize.autovectorize_vector_modes (&vector_modes, true); + for (unsigned int i = 0; i < vector_modes.length (); ++i) { - poly_uint64 cur = vector_sizes[i]; + poly_uint64 cur = GET_MODE_SIZE (vector_modes[i]); poly_uint64 nunits; if (!multiple_p (cur, GET_MODE_SIZE (smode), &nunits)) continue; if (mode_for_vector (smode, nunits).exists (&vmode) && VECTOR_MODE_P (vmode) - && targetm.vectorize.get_mask_mode (nunits, cur).exists (&mask_mode) + && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode) && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing) return true; } diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c index 8157798cc71..341e02bd51c 100644 --- a/gcc/optabs-tree.c +++ b/gcc/optabs-tree.c @@ -300,6 +300,20 @@ supportable_convert_operation (enum tree_code code, return true; } + if (GET_MODE_UNIT_PRECISION (m1) > GET_MODE_UNIT_PRECISION (m2) + && can_extend_p (m1, m2, TYPE_UNSIGNED (vectype_in))) + { + *code1 = code; + return true; + } + + if (GET_MODE_UNIT_PRECISION (m1) < GET_MODE_UNIT_PRECISION (m2) + && convert_optab_handler (trunc_optab, m1, m2) != CODE_FOR_nothing) + { + *code1 = code; + return true; + } + /* Now check for builtin. 
*/ if (targetm.vectorize.builtin_conversion && targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in)) diff --git a/gcc/optabs.c b/gcc/optabs.c index 7d7efe0a4a2..c2c1274ebdb 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -2095,8 +2095,8 @@ expand_twoval_binop (optab binoptab, rtx op0, rtx op1, rtx targ0, rtx targ1, xop1 = avoid_expensive_constant (mode1, binoptab, 1, xop1, unsignedp); create_fixed_operand (&ops[0], targ0); - create_convert_operand_from (&ops[1], op0, mode, unsignedp); - create_convert_operand_from (&ops[2], op1, mode, unsignedp); + create_convert_operand_from (&ops[1], xop0, mode, unsignedp); + create_convert_operand_from (&ops[2], xop1, mode, unsignedp); create_fixed_operand (&ops[3], targ1); if (maybe_expand_insn (icode, 4, ops)) return 1; @@ -5486,7 +5486,7 @@ expand_vec_perm_1 (enum insn_code icode, rtx target, struct expand_operand ops[4]; gcc_assert (GET_MODE_CLASS (smode) == MODE_VECTOR_INT - || mode_for_int_vector (tmode).require () == smode); + || related_int_vector_mode (tmode).require () == smode); create_output_operand (&ops[0], target, tmode); create_input_operand (&ops[3], sel, smode); @@ -5611,8 +5611,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1, /* The optabs are only defined for selectors with the same width as the values being permuted. */ machine_mode required_sel_mode; - if (!mode_for_int_vector (mode).exists (&required_sel_mode) - || !VECTOR_MODE_P (required_sel_mode)) + if (!related_int_vector_mode (mode).exists (&required_sel_mode)) { delete_insns_since (last); return NULL_RTX; diff --git a/gcc/params.def b/gcc/params.def index 3f18642475a..b269045fb9c 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -1403,7 +1403,7 @@ DEFPARAM (PARAM_MAX_VRP_SWITCH_ASSERTIONS, DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK, "vect-epilogues-nomask", "Enable loop epilogue vectorization using smaller vector size.", - 0, 0, 1) + 1, 0, 1) DEFPARAM(PARAM_UNROLL_JAM_MIN_PERCENT, "unroll-jam-min-percent", diff --git a/gcc/poly-int.h b/gcc/poly-int.h index d68a652b5fa..ba39ca471be 100644 --- a/gcc/poly-int.h +++ b/gcc/poly-int.h @@ -1528,6 +1528,29 @@ constant_lower_bound (const poly_int_pod &a) return a.coeffs[0]; } +/* Return the constant lower bound of A, given that it is no less than B. */ + +template +inline POLY_CONST_COEFF (Ca, Cb) +constant_lower_bound_with_limit (const poly_int_pod &a, const Cb &b) +{ + if (known_ge (a, b)) + return a.coeffs[0]; + return b; +} + +/* Return the constant upper bound of A, given that it is no greater + than B. */ + +template +inline POLY_CONST_COEFF (Ca, Cb) +constant_upper_bound_with_limit (const poly_int_pod &a, const Cb &b) +{ + if (known_le (a, b)) + return a.coeffs[0]; + return b; +} + /* Return a value that is known to be no greater than A and B. This will be the greatest lower bound for some indeterminate values but not necessarily for all. 
*/ diff --git a/gcc/read-rtl.c b/gcc/read-rtl.c index ebd69bde531..1af51f686c7 100644 --- a/gcc/read-rtl.c +++ b/gcc/read-rtl.c @@ -1282,7 +1282,7 @@ read_subst_mapping (htab_t subst_iters_table, htab_t subst_attrs_table, m = add_mapping (&substs, subst_iters_table, attr_operands[1]); end_ptr = &m->values; end_ptr = add_map_value (end_ptr, 1, ""); - end_ptr = add_map_value (end_ptr, 2, ""); + add_map_value (end_ptr, 2, ""); add_define_attr_for_define_subst (attr_operands[1], queue); } @@ -1290,7 +1290,7 @@ read_subst_mapping (htab_t subst_iters_table, htab_t subst_attrs_table, m = add_mapping (&substs, subst_attrs_table, attr_operands[0]); end_ptr = &m->values; end_ptr = add_map_value (end_ptr, 1, attr_operands[2]); - end_ptr = add_map_value (end_ptr, 2, attr_operands[3]); + add_map_value (end_ptr, 2, attr_operands[3]); } /* Check newly-created code iterator ITERATOR to see whether every code has the diff --git a/gcc/regrename.c b/gcc/regrename.c index 637b3cbe6d7..5259d565e58 100644 --- a/gcc/regrename.c +++ b/gcc/regrename.c @@ -1426,10 +1426,9 @@ scan_rtx (rtx_insn *insn, rtx *loc, enum reg_class cl, enum scan_actions action, { const char *fmt; rtx x = *loc; - enum rtx_code code = GET_CODE (x); int i, j; - code = GET_CODE (x); + enum rtx_code code = GET_CODE (x); switch (code) { case CONST: diff --git a/gcc/reorg.c b/gcc/reorg.c index 81349382b81..bdfcf8851cd 100644 --- a/gcc/reorg.c +++ b/gcc/reorg.c @@ -2708,14 +2708,13 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, && GET_CODE (PATTERN (new_thread)) != ASM_INPUT && asm_noperands (PATTERN (new_thread)) < 0) { - rtx pat = PATTERN (new_thread); rtx dest; rtx src; /* We know "new_thread" is an insn due to NONJUMP_INSN_P (new_thread) above. */ trial = as_a (new_thread); - pat = PATTERN (trial); + rtx pat = PATTERN (trial); if (!NONJUMP_INSN_P (trial) || GET_CODE (pat) != SET diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index 50bbb79655b..bdbd1b98eba 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -6709,6 +6709,17 @@ simplify_subreg (machine_mode outermode, rtx op, } } + /* If OP is a vector comparison and the subreg is not changing the + number of elements or the size of the elements, change the result + of the comparison to the new mode. */ + if (COMPARISON_P (op) + && VECTOR_MODE_P (outermode) + && VECTOR_MODE_P (innermode) + && known_eq (GET_MODE_NUNITS (outermode), GET_MODE_NUNITS (innermode)) + && known_eq (GET_MODE_UNIT_SIZE (outermode), + GET_MODE_UNIT_SIZE (innermode))) + return simplify_gen_relational (GET_CODE (op), outermode, innermode, + XEXP (op, 0), XEXP (op, 1)); return NULL_RTX; } diff --git a/gcc/stor-layout.c b/gcc/stor-layout.c index 5d6f2e0166c..a054b7887e7 100644 --- a/gcc/stor-layout.c +++ b/gcc/stor-layout.c @@ -514,18 +514,43 @@ mode_for_vector (scalar_mode innermode, poly_uint64 nunits) return opt_machine_mode (); } -/* Return the mode for a vector that has NUNITS integer elements of - INT_BITS bits each, if such a mode exists. The mode can be either - an integer mode or a vector mode. */ +/* If a piece of code is using vector mode VECTOR_MODE and also wants + to operate on elements of mode ELEMENT_MODE, return the vector mode + it should use for those elements. If NUNITS is nonzero, ensure that + the mode has exactly NUNITS elements, otherwise pick whichever vector + size pairs the most naturally with VECTOR_MODE; this may mean choosing + a mode with a different size and/or number of elements, depending on + what the target prefers. 
Return an empty opt_machine_mode if there + is no supported vector mode with the required properties. + + Unlike mode_for_vector. any returned mode is guaranteed to satisfy + both VECTOR_MODE_P and targetm.vector_mode_supported_p. */ opt_machine_mode -mode_for_int_vector (unsigned int int_bits, poly_uint64 nunits) +related_vector_mode (machine_mode vector_mode, scalar_mode element_mode, + poly_uint64 nunits) { + gcc_assert (VECTOR_MODE_P (vector_mode)); + return targetm.vectorize.related_mode (vector_mode, element_mode, nunits); +} + +/* If a piece of code is using vector mode VECTOR_MODE and also wants + to operate on integer vectors with the same element size and number + of elements, return the vector mode it should use. Return an empty + opt_machine_mode if there is no supported vector mode with the + required properties. + + Unlike mode_for_vector. any returned mode is guaranteed to satisfy + both VECTOR_MODE_P and targetm.vector_mode_supported_p. */ + +opt_machine_mode +related_int_vector_mode (machine_mode vector_mode) +{ + gcc_assert (VECTOR_MODE_P (vector_mode)); scalar_int_mode int_mode; - machine_mode vec_mode; - if (int_mode_for_size (int_bits, 0).exists (&int_mode) - && mode_for_vector (int_mode, nunits).exists (&vec_mode)) - return vec_mode; + if (int_mode_for_mode (GET_MODE_INNER (vector_mode)).exists (&int_mode)) + return related_vector_mode (vector_mode, int_mode, + GET_MODE_NUNITS (vector_mode)); return opt_machine_mode (); } diff --git a/gcc/target.def b/gcc/target.def index 66cee075018..f998470fffd 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1894,33 +1894,80 @@ reached. The default is @var{mode} which means no splitting.", /* Returns a mask of vector sizes to iterate over when auto-vectorizing after processing the preferred one derived from preferred_simd_mode. */ DEFHOOK -(autovectorize_vector_sizes, - "If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not\n\ -the only one that is worth considering, this hook should add all suitable\n\ -vector sizes to @var{sizes}, in order of decreasing preference. The first\n\ -one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.\n\ +(autovectorize_vector_modes, + "If using the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}\n\ +is not the only approach worth considering, this hook should add one mode to\n\ +@var{modes} for each useful alternative approach. These modes are then\n\ +passed to @code{TARGET_VECTORIZE_RELATED_MODE} to obtain the vector mode\n\ +for a given element mode.\n\ +\n\ +The modes returned in @var{modes} should use the smallest element mode\n\ +possible for the vectorization approach that they represent, preferring\n\ +integer modes over floating-point modes in the event of a tie. The first\n\ +mode should be the @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} for its\n\ +element mode.\n\ +\n\ +If @var{all} is true, add suitable vector modes even when they are generally\n\ +not expected to be worthwhile.\n\ +\n\ +The hook returns a bitmask of flags that control how the modes in\n\ +@var{modes} are used. The flags are:\n\ +@table @code\n\ +@item VECT_COMPARE_COSTS\n\ +Tells the loop vectorizer to try all the provided modes and pick the one\n\ +with the lowest cost. By default the vectorizer will choose the first\n\ +mode that works.\n\ +@end table\n\ \n\ The hook does not need to do anything if the vector returned by\n\ @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\ -for autovectorization. 
The default implementation does nothing.", - void, - (vector_sizes *sizes), - default_autovectorize_vector_sizes) +for autovectorization. The default implementation adds no modes and\n\ +returns 0.", + unsigned int, + (vector_modes *modes, bool all), + default_autovectorize_vector_modes) + +DEFHOOK +(related_mode, + "If a piece of code is using vector mode @var{vector_mode} and also wants\n\ +to operate on elements of mode @var{element_mode}, return the vector mode\n\ +it should use for those elements. If @var{nunits} is nonzero, ensure that\n\ +the mode has exactly @var{nunits} elements, otherwise pick whichever vector\n\ +size pairs the most naturally with @var{vector_mode}. Return an empty\n\ +@code{opt_machine_mode} if there is no supported vector mode with the\n\ +required properties.\n\ +\n\ +There is no prescribed way of handling the case in which @var{nunits}\n\ +is zero. One common choice is to pick a vector mode with the same size\n\ +as @var{vector_mode}; this is the natural choice if the target has a\n\ +fixed vector size. Another option is to choose a vector mode with the\n\ +same number of elements as @var{vector_mode}; this is the natural choice\n\ +if the target has a fixed number of elements. Alternatively, the hook\n\ +might choose a middle ground, such as trying to keep the number of\n\ +elements as similar as possible while applying maximum and minimum\n\ +vector sizes.\n\ +\n\ +The default implementation uses @code{mode_for_vector} to find the\n\ +requested mode, returning a mode with the same size as @var{vector_mode}\n\ +when @var{nunits} is zero. This is the correct behavior for most targets.", + opt_machine_mode, + (machine_mode vector_mode, scalar_mode element_mode, poly_uint64 nunits), + default_vectorize_related_mode) /* Function to get a target mode for a vector mask. */ DEFHOOK (get_mask_mode, - "A vector mask is a value that holds one boolean result for every element\n\ -in a vector. This hook returns the machine mode that should be used to\n\ -represent such a mask when the vector in question is @var{length} bytes\n\ -long and contains @var{nunits} elements. The hook returns an empty\n\ -@code{opt_machine_mode} if no such mode exists.\n\ -\n\ -The default implementation returns the mode of an integer vector that\n\ -is @var{length} bytes long and that contains @var{nunits} elements,\n\ -if such a mode exists.", + "Return the mode to use for a vector mask that holds one boolean\n\ +result for each element of vector mode @var{mode}. The returned mask mode\n\ +can be a vector of integers (class @code{MODE_VECTOR_INT}), a vector of\n\ +booleans (class @code{MODE_VECTOR_BOOL}) or a scalar integer (class\n\ +@code{MODE_INT}). Return an empty @code{opt_machine_mode} if no such\n\ +mask mode exists.\n\ +\n\ +The default implementation returns a @code{MODE_VECTOR_INT} with the\n\ +same size and number of elements as @var{mode}, if such a mode exists.", opt_machine_mode, - (poly_uint64 nunits, poly_uint64 length), + (machine_mode mode), default_get_mask_mode) /* Function to say whether a masked operation is expensive when the diff --git a/gcc/target.h b/gcc/target.h index 008932b5dbd..057e6ae8768 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -199,11 +199,19 @@ enum vect_cost_model_location { class vec_perm_indices; /* The type to use for lists of vector sizes. */ -typedef vec vector_sizes; +typedef vec vector_modes; /* Same, but can be used to construct local lists that are automatically freed. 
*/ -typedef auto_vec auto_vector_sizes; +typedef auto_vec auto_vector_modes; + +/* Flags returned by TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES: + + VECT_COMPARE_COSTS + Tells the loop vectorizer to try all the provided modes and + pick the one with the lowest cost. By default the vectorizer + will choose the first mode that works. */ +const unsigned int VECT_COMPARE_COSTS = 1U << 0; /* The target structure. This holds all the backend hooks. */ #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME; diff --git a/gcc/targhooks.c b/gcc/targhooks.c index 02b9dc59611..6396f6f4bdf 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -1312,32 +1312,39 @@ default_split_reduction (machine_mode mode) return mode; } -/* By default only the size derived from the preferred vector mode - is tried. */ +/* By default only the preferred vector mode is tried. */ -void -default_autovectorize_vector_sizes (vector_sizes *) +unsigned int +default_autovectorize_vector_modes (vector_modes *, bool) { + return 0; } -/* By default a vector of integers is used as a mask. */ +/* The default implementation of TARGET_VECTORIZE_RELATED_MODE. */ opt_machine_mode -default_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size) -{ - unsigned int elem_size = vector_element_size (vector_size, nunits); - scalar_int_mode elem_mode - = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT); - machine_mode vector_mode; +default_vectorize_related_mode (machine_mode vector_mode, + scalar_mode element_mode, + poly_uint64 nunits) +{ + machine_mode result_mode; + if ((maybe_ne (nunits, 0U) + || multiple_p (GET_MODE_SIZE (vector_mode), + GET_MODE_SIZE (element_mode), &nunits)) + && mode_for_vector (element_mode, nunits).exists (&result_mode) + && VECTOR_MODE_P (result_mode) + && targetm.vector_mode_supported_p (result_mode)) + return result_mode; - gcc_assert (known_eq (elem_size * nunits, vector_size)); + return opt_machine_mode (); +} - if (mode_for_vector (elem_mode, nunits).exists (&vector_mode) - && VECTOR_MODE_P (vector_mode) - && targetm.vector_mode_supported_p (vector_mode)) - return vector_mode; +/* By default a vector of integers is used as a mask. */ - return opt_machine_mode (); +opt_machine_mode +default_get_mask_mode (machine_mode mode) +{ + return related_int_vector_mode (mode); } /* By default consider masked stores to be expensive. 
*/ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 59436278dcf..2d599190891 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -110,8 +110,11 @@ default_builtin_support_vector_misalignment (machine_mode mode, int, bool); extern machine_mode default_preferred_simd_mode (scalar_mode mode); extern machine_mode default_split_reduction (machine_mode); -extern void default_autovectorize_vector_sizes (vector_sizes *); -extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64); +extern unsigned int default_autovectorize_vector_modes (vector_modes *, bool); +extern opt_machine_mode default_vectorize_related_mode (machine_mode, + scalar_mode, + poly_uint64); +extern opt_machine_mode default_get_mask_mode (machine_mode); extern bool default_empty_mask_is_expensive (unsigned); extern void *default_init_cost (struct loop *); extern unsigned default_add_stmt_cost (void *, int, enum vect_cost_for_stmt, diff --git a/gcc/testsuite/g++.dg/opt/pr92317.C b/gcc/testsuite/g++.dg/opt/pr92317.C new file mode 100644 index 00000000000..2bb9729fc96 --- /dev/null +++ b/gcc/testsuite/g++.dg/opt/pr92317.C @@ -0,0 +1,51 @@ +// Copied from pr87967.C +// { dg-do compile { target c++11 } } +// { dg-options "-O2 -ftree-vectorize -fno-tree-pre --param vect-epilogues-nomask=1" } + +void h(); +template struct k { using d = b; }; +template class> using e = k; +template class f> +using g = typename e::d; +struct l { + template using ab = typename i::j; +}; +struct n : l { + using j = g; +}; +class o { +public: + long r(); +}; +char m; +char s() { + if (m) + return '0'; + return 'A'; +} +class t { +public: + typedef char *ad; + ad m_fn2(); +}; +void fn3() { + char *a; + t b; + bool p = false; + while (*a) { + h(); + o c; + if (*a) + a++; + if (c.r()) { + n::j q; + for (t::ad d = b.m_fn2(), e; d != e; d++) { + char f = *q; + *d = f + s(); + } + p = true; + } + } + if (p) + throw; +} diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr90883.C b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C new file mode 100644 index 00000000000..0e622f263d2 --- /dev/null +++ b/gcc/testsuite/g++.dg/tree-ssa/pr90883.C @@ -0,0 +1,20 @@ +// { dg-options "-O2 -Os -fdump-tree-dse-details -std=c++11 --param max-inline-insns-size=1" } + + + class C + { + char a[7]{}; + int b{}; + }; + + C slow() + { + return {}; + } + + +// We want to match enough here to capture that we deleted an empty +// constructor store +// aarch64 and mips will expand to loop to clear because CLEAR_RATIO. 
+// { dg-final { scan-tree-dump "Deleted redundant store: .*\.a = {}" "dse1" { xfail { aarch64-*-* mips*-*-* } } } } + diff --git a/gcc/testsuite/gcc.dg/pr92162.c b/gcc/testsuite/gcc.dg/pr92162.c new file mode 100644 index 00000000000..ed82595a752 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr92162.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast" } */ + +short int s8; + +void __attribute__ ((simd)) +gn (void) +{ + s8 = 0; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr91896.c b/gcc/testsuite/gcc.dg/torture/pr91896.c new file mode 100644 index 00000000000..e728538bb9a --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr91896.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-vectorize" } */ + +unsigned int +zj (unsigned int et) +{ + signed char jr = 0; + + do { + et *= 3; + jr += 2; + } while (jr >= 0); + + if (et == (unsigned int) jr) + et = 0; + + return et; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92069.c b/gcc/testsuite/gcc.dg/torture/pr92069.c new file mode 100644 index 00000000000..806ff5fba14 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92069.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-vectorize" } */ + +unsigned int a, c, d; +double b; +void e() +{ + for (; d; d++) + { + double f; + a = 2; + for (; a; a++) + { + c = b; + b = f; + f = c; + } + } +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92173.c b/gcc/testsuite/gcc.dg/torture/pr92173.c new file mode 100644 index 00000000000..fcb3548b716 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92173.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-vectorize" } */ + +unsigned int +yo (unsigned int o0, signed char s1) +{ + for (s1 = 0; s1 < 1; s1 -= 2) + o0 += o0; + + return o0 + s1; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92241.c b/gcc/testsuite/gcc.dg/torture/pr92241.c new file mode 100644 index 00000000000..331d03b3d44 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92241.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-vectorize" } */ + +int a, b; +char c[2]; +void d() { + char e; + for (; b; b--) { + e = 0; + for (; e <= 1; e++) + a &= c[b + e] && 1; + } +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92275.c b/gcc/testsuite/gcc.dg/torture/pr92275.c new file mode 100644 index 00000000000..b9f70889758 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92275.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-vectorize" } */ + +unsigned long a, c; +int *b, *b2; +long d; + +void fn1() +{ + for (; b < b2; b++) + d += *b * c; + d *= a; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92371.c b/gcc/testsuite/gcc.dg/torture/pr92371.c new file mode 100644 index 00000000000..0c78d32f471 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92371.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ftree-vectorize" } */ + +int a, b; +void d() +{ + int c = sizeof(int); + for (; a; a++) + c *= sizeof(int); + c *= sizeof(int); + b = c; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-36.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-36.c new file mode 100644 index 00000000000..23a53bb4ad2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-36.c @@ -0,0 +1,65 @@ +/* { dg-options "-O2 -fdump-tree-dse-details -fno-tree-fre" } */ +#include +#include + +struct X +{ + char mem0[10]; + char mem1[10]; +}; + + +void blah (struct X); + + +void +foo1() +{ + struct X x = { }; + memset (x.mem1, 0, sizeof x.mem1); + blah (x); +} + +void +foo2() +{ + 
struct X x = { }; + x.mem1[5] = 0; + blah (x); +} + +void +bar1 () +{ + struct X x; + memset (&x, 0, sizeof x); + memset (&x.mem1, 0, sizeof x.mem1); + blah (x); +} +void +bar2 () +{ + struct X x; + memset (&x, 0, sizeof x); + x.mem1[5] = 0; + blah (x); +} + +void +baz1 () +{ + struct X *x = calloc (sizeof (struct X), 1); + memset (&x->mem1, 0, sizeof x->mem1); + blah (*x); +} + +void +baz2 () +{ + struct X *x = calloc (sizeof (struct X), 1); + x->mem1[5] = 0; + blah (*x); +} +/* { dg-final { scan-tree-dump-times "Deleted redundant call" 3 "dse1" } } */ +/* { dg-final { scan-tree-dump-times "Deleted redundant store" 3 "dse1" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-div-2.c b/gcc/testsuite/gcc.dg/vect/bb-slp-div-2.c new file mode 100644 index 00000000000..715c22ac6c6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-div-2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +int x[4], y[4], z[4]; + +void +f (void) +{ + x[0] += y[0] / z[0] * 2; + x[1] += y[1] / z[1] * 2; + x[2] += y[2] / z[2] * 2; + x[3] += y[3] / z[3] * 2; +} + +/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_int } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c index 85f9a02582f..813b1af089a 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr69907.c @@ -18,5 +18,6 @@ void foo(unsigned *p1, unsigned short *p2) } /* Disable for SVE because for long or variable-length vectors we don't - get an unrolled epilogue loop. */ -/* { dg-final { scan-tree-dump "BB vectorization with gaps at the end of a load is not supported" "slp1" { target { ! aarch64_sve } } } } */ + get an unrolled epilogue loop. Also disable for AArch64 Advanced SIMD, + because there we can vectorize the epilogue using mixed vector sizes. */ +/* { dg-final { scan-tree-dump "BB vectorization with gaps at the end of a load is not supported" "slp1" { target { ! aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c index 228190ab05d..877de4eb5be 100644 --- a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c +++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ #include "tree-vect.h" extern float copysignf (float, float); diff --git a/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c b/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c index 7a148e41d51..5f871289337 100644 --- a/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c +++ b/gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float_strict } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c index 1d674504e2c..022d49f1175 100644 --- a/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-noreassoc-slp-reduc-7.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c index e4202b10d06..b5f8c3c88e4 100644 --- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c @@ -46,4 +46,4 @@ int main (void) } /* Until we support multiple types in the inner loop */ -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail { ! aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c index 50b4998bb6c..7049e4936b9 100644 --- a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-require-effective-target vect_int } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c index c3b242157ce..d2ae7976781 100644 --- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c +++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-31.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c index 470bbfb5537..243e01e6dad 100644 --- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c +++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-64.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c index 805024d8058..e339590bacb 100644 --- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c +++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-66.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c index 726c0de652f..c403a8302d8 100644 --- a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c +++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-68.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-skip-if "AArch64 tiny code model does not support programs larger than 1MiB" {aarch64_tiny} } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c index 4513c40b34f..dcb53701795 100644 --- a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c +++ b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-dv-2.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/pr33804.c b/gcc/testsuite/gcc.dg/vect/pr33804.c index 86babbe60e7..0db13674b42 100644 --- a/gcc/testsuite/gcc.dg/vect/pr33804.c +++ b/gcc/testsuite/gcc.dg/vect/pr33804.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-require-effective-target vect_int } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr53773.c b/gcc/testsuite/gcc.dg/vect/pr53773.c index 0bcc021767e..7f8229571ec 100644 --- a/gcc/testsuite/gcc.dg/vect/pr53773.c +++ b/gcc/testsuite/gcc.dg/vect/pr53773.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-additional-options "-fdump-tree-optimized" } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65930-1.c b/gcc/testsuite/gcc.dg/vect/pr65930-1.c new file mode 100644 index 00000000000..895fbf8869d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr65930-1.c @@ -0,0 +1,26 @@ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +unsigned __attribute__((noipa)) +bar (unsigned int *x) +{ + int sum = 4; + x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__); + for (int i = 0; i < 16; ++i) + sum += x[i]; + return sum; +} + +int +main() +{ + static int a[16] __attribute__((aligned(__BIGGEST_ALIGNMENT__))) + = { 1, 3, 5, 8, 9, 10, 17, 18, 23, 29, 30, 55, 42, 2, 3, 1 }; + check_vect (); + if (bar (a) != 260) + abort (); + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65930-2.c b/gcc/testsuite/gcc.dg/vect/pr65930-2.c new file mode 100644 index 00000000000..9cfb9b102d9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr65930-2.c @@ -0,0 +1,28 @@ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +int __attribute__((noipa)) +bar (unsigned int *x, int n) +{ + int sum = 4; + x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__); + for (int i = 0; i < n; ++i) + sum += x[i*4+0]+ x[i*4 + 1] + x[i*4 + 2] + x[i*4 + 3]; + return sum; +} + +int +main () +{ + static int a[16] __attribute__((aligned(__BIGGEST_ALIGNMENT__))) + = { 1, 3, 5, 8, 9, 10, 17, 18, 23, 29, 30, 55, 42, 2, 3, 1 }; + check_vect (); + if (bar (a, 4) != 260) + abort (); + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-1.c b/gcc/testsuite/gcc.dg/vect/pr65947-1.c index 879819d576a..9fc74a1ab28 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-1.c +++ 
b/gcc/testsuite/gcc.dg/vect/pr65947-1.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_condition } */ #include "tree-vect.h" @@ -41,5 +43,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { target { ! vect_fold_extract_last } } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-10.c b/gcc/testsuite/gcc.dg/vect/pr65947-10.c index f37aecab082..e4a1d9419c2 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-10.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-10.c @@ -42,6 +42,6 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-12.c b/gcc/testsuite/gcc.dg/vect/pr65947-12.c index b84fd41bc63..a47f4146a29 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-12.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-12.c @@ -42,5 +42,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-13.c b/gcc/testsuite/gcc.dg/vect/pr65947-13.c index e1d3ff52f5c..a703923151d 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-13.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-13.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_condition } */ #include "tree-vect.h" @@ -41,5 +43,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { xfail vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 
2 "vect" { xfail vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-14.c b/gcc/testsuite/gcc.dg/vect/pr65947-14.c index 9f1e4e1eb6a..3b76fda2122 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-14.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-14.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_condition } */ #include "tree-vect.h" @@ -41,5 +43,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { target { ! vect_fold_extract_last } } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-2.c b/gcc/testsuite/gcc.dg/vect/pr65947-2.c index 18d33c436a5..58ba5f764d0 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-2.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-2.c @@ -42,5 +42,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-3.c b/gcc/testsuite/gcc.dg/vect/pr65947-3.c index 427abdb4140..6b4077e1a62 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-3.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-3.c @@ -52,5 +52,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-4.c b/gcc/testsuite/gcc.dg/vect/pr65947-4.c index 186e03a6346..471fbe2da21 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-4.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-4.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_condition } */ #include "tree-vect.h" @@ -41,6 +43,6 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 4 "vect" { target { ! 
vect_fold_extract_last } } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-5.c b/gcc/testsuite/gcc.dg/vect/pr65947-5.c index c91b648aa05..4e3f765cd0c 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-5.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-5.c @@ -53,5 +53,5 @@ main (void) /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 1 "vect" { target { ! vect_fold_extract_last } } } } */ /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump "loop size is greater than data size" "vect" { xfail vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-6.c b/gcc/testsuite/gcc.dg/vect/pr65947-6.c index b072c8d33a2..dde96d7a553 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-6.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-6.c @@ -41,5 +41,5 @@ main (void) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 4 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr65947-9.c b/gcc/testsuite/gcc.dg/vect/pr65947-9.c index e43e0e473be..1f295306016 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65947-9.c +++ b/gcc/testsuite/gcc.dg/vect/pr65947-9.c @@ -48,5 +48,5 @@ main () /* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" { target { ! vect_fold_extract_last } } } } */ /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 1 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump "loop size is greater than data size" "vect" { target { ! vect_fold_extract_last } } } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 1 "vect" { target vect_fold_extract_last } } } */ /* { dg-final { scan-tree-dump-not "condition expression based on integer induction." "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr80631-1.c b/gcc/testsuite/gcc.dg/vect/pr80631-1.c index f2405198a10..cbb9a6ff69a 100644 --- a/gcc/testsuite/gcc.dg/vect/pr80631-1.c +++ b/gcc/testsuite/gcc.dg/vect/pr80631-1.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* PR tree-optimization/80631 */ #include "tree-vect.h" @@ -72,5 +74,5 @@ main () } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 5 "vect" { target vect_condition } } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 10 "vect" { target vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 10 "vect" { target { { ! vect_fold_extract_last } && vect_condition } } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 5 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 5 "vect" { target { { ! vect_fold_extract_last } && vect_condition } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr80631-2.c b/gcc/testsuite/gcc.dg/vect/pr80631-2.c index b334ca2345b..61e11316af2 100644 --- a/gcc/testsuite/gcc.dg/vect/pr80631-2.c +++ b/gcc/testsuite/gcc.dg/vect/pr80631-2.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* PR tree-optimization/80631 */ #include "tree-vect.h" @@ -72,5 +74,5 @@ main () } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 5 "vect" { target vect_condition } } } */ -/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 10 "vect" { target vect_condition xfail vect_fold_extract_last } } } */ -/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 10 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 5 "vect" { target vect_condition xfail vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 5 "vect" { target vect_fold_extract_last } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr92205.c b/gcc/testsuite/gcc.dg/vect/pr92205.c new file mode 100644 index 00000000000..a031c1fe297 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr92205.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +int b(int n, unsigned char *a) +{ + int d = 0; + a = __builtin_assume_aligned (a, __BIGGEST_ALIGNMENT__); + for (int c = 0; c < n; ++c) + d |= a[c]; + return d; +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-23.c b/gcc/testsuite/gcc.dg/vect/slp-23.c index 7d330c787d1..d7c67fe2c6e 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-23.c +++ b/gcc/testsuite/gcc.dg/vect/slp-23.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/slp-25.c b/gcc/testsuite/gcc.dg/vect/slp-25.c index ff7eff202cb..1c33927c434 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-25.c +++ b/gcc/testsuite/gcc.dg/vect/slp-25.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-9.c b/gcc/testsuite/gcc.dg/vect/slp-9.c index d0c94f1986b..d5212dca3dd 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-9.c +++ b/gcc/testsuite/gcc.dg/vect/slp-9.c @@ -44,5 +44,5 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_widen_mult_hi_to_si } } }*/ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_widen_mult_hi_to_si } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target vect_widen_mult_hi_to_si } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c index 07c96c00eb0..15dd59922fc 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c index fc689e46ba1..f457c11aa3c 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c index 88591c5bdcb..1fd15aa3c87 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c new file mode 100644 index 00000000000..7d9255e48f2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c @@ -0,0 +1,31 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_usad_char } */ +/* With AVX256 or more we do not pull off the trick eliding the epilogue. 
*/ +/* { dg-additional-options "-mprefer-avx128" { target { x86_64-*-* i?86-*-* } } } */ + +typedef unsigned char uint8_t; +int x264_pixel_sad_8x8( uint8_t *pix1, uint8_t *pix2, int i_stride_pix2 ) +{ + int i_sum = 0; + for( int y = 0; y < 8; y++ ) + { + i_sum += __builtin_abs( pix1[0] - pix2[0] ); + i_sum += __builtin_abs( pix1[1] - pix2[1] ); + i_sum += __builtin_abs( pix1[2] - pix2[2] ); + i_sum += __builtin_abs( pix1[3] - pix2[3] ); + i_sum += __builtin_abs( pix1[4] - pix2[4] ); + i_sum += __builtin_abs( pix1[5] - pix2[5] ); + i_sum += __builtin_abs( pix1[6] - pix2[6] ); + i_sum += __builtin_abs( pix1[7] - pix2[7] ); + pix1 += 16; + pix2 += i_stride_pix2; + } + return i_sum; +} + +/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-not "access with gaps requires scalar epilogue loop" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c index f5fb63e19f1..e3bfee33348 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c +++ b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include "tree-vect.h" diff --git a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c index 4460d59b5a1..abb10fde45b 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c +++ b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-s16.c @@ -38,5 +38,5 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c index 6e72c4878c2..0756119afb4 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c +++ b/gcc/testsuite/gcc.dg/vect/slp-widen-mult-u8.c @@ -38,5 +38,5 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c index 8a57eb69a91..f09c964fdc1 100644 --- a/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c +++ b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-require-effective-target vect_int } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-103.c b/gcc/testsuite/gcc.dg/vect/vect-103.c index 4a9e1574eb0..2a4510482d4 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-103.c +++ b/gcc/testsuite/gcc.dg/vect/vect-103.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-109.c b/gcc/testsuite/gcc.dg/vect/vect-109.c index 9a507105899..ac5d0827899 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-109.c +++ b/gcc/testsuite/gcc.dg/vect/vect-109.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-skip-if "" { vect_no_align } } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-119.c b/gcc/testsuite/gcc.dg/vect/vect-119.c index aa8c3002bff..29a9c51cd29 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-119.c +++ b/gcc/testsuite/gcc.dg/vect/vect-119.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-require-effective-target vect_int } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-24.c b/gcc/testsuite/gcc.dg/vect/vect-24.c index cbff6c55fa4..fa4c0620d29 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-24.c +++ b/gcc/testsuite/gcc.dg/vect/vect-24.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-26.c b/gcc/testsuite/gcc.dg/vect/vect-26.c index 4f0472b5d0f..8a141f38400 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-26.c +++ b/gcc/testsuite/gcc.dg/vect/vect-26.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-27.c b/gcc/testsuite/gcc.dg/vect/vect-27.c index 590217feee7..ac86b21aceb 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-27.c +++ b/gcc/testsuite/gcc.dg/vect/vect-27.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-29.c b/gcc/testsuite/gcc.dg/vect/vect-29.c index 86ec2cc1ddf..bbd446dfe63 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-29.c +++ b/gcc/testsuite/gcc.dg/vect/vect-29.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-42.c b/gcc/testsuite/gcc.dg/vect/vect-42.c index a65b4a62276..086cbf20c0a 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-42.c +++ b/gcc/testsuite/gcc.dg/vect/vect-42.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-44.c b/gcc/testsuite/gcc.dg/vect/vect-44.c index 03ef2c0f671..f7f1fd28665 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-44.c +++ b/gcc/testsuite/gcc.dg/vect/vect-44.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ /* { dg-additional-options "--param vect-max-peeling-for-alignment=0" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-48.c b/gcc/testsuite/gcc.dg/vect/vect-48.c index bac6ef6b8dd..b29fe47635a 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-48.c +++ b/gcc/testsuite/gcc.dg/vect/vect-48.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-50.c b/gcc/testsuite/gcc.dg/vect/vect-50.c index c9500ca91e5..f43676896af 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-50.c +++ b/gcc/testsuite/gcc.dg/vect/vect-50.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ /* { dg-additional-options "--param vect-max-peeling-for-alignment=0" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-52.c b/gcc/testsuite/gcc.dg/vect/vect-52.c index 0343d9a24d1..c20a4be2ede 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-52.c +++ b/gcc/testsuite/gcc.dg/vect/vect-52.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-54.c b/gcc/testsuite/gcc.dg/vect/vect-54.c index 58201abe069..2b236e48e19 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-54.c +++ b/gcc/testsuite/gcc.dg/vect/vect-54.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-56.c b/gcc/testsuite/gcc.dg/vect/vect-56.c index 8060b05e781..c914126ece5 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-56.c +++ b/gcc/testsuite/gcc.dg/vect/vect-56.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_float } */
 
 #include
diff --git a/gcc/testsuite/gcc.dg/vect/vect-58.c b/gcc/testsuite/gcc.dg/vect/vect-58.c
index 441af51860e..da4f9740e33 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-58.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-58.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_float } */
 
 #include
diff --git a/gcc/testsuite/gcc.dg/vect/vect-60.c b/gcc/testsuite/gcc.dg/vect/vect-60.c
index 3b7477c96ab..121c503c63a 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-60.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-60.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_float } */
 
 #include
diff --git a/gcc/testsuite/gcc.dg/vect/vect-72.c b/gcc/testsuite/gcc.dg/vect/vect-72.c
index 472d8d57549..9e8e91b7ae6 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-72.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-72.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-add-options bind_pic_locally } */
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c
index 42b2b8d91aa..a3fb5053037 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-75-big-array.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 
 #include
diff --git a/gcc/testsuite/gcc.dg/vect/vect-75.c b/gcc/testsuite/gcc.dg/vect/vect-75.c
index 2cdd7032242..88da97f0bb7 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-75.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 
 #include
diff --git a/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c b/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c
index 56ee797d10b..fb3e4992782 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-77-alignchecks.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 
 #include
diff --git a/gcc/testsuite/gcc.dg/vect/vect-77-global.c b/gcc/testsuite/gcc.dg/vect/vect-77-global.c
index f0b73505d68..1580d6e075b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-77-global.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-77-global.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c b/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c index c3ef8a36591..57e8da0a909 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c +++ b/gcc/testsuite/gcc.dg/vect/vect-78-alignchecks.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-78-global.c b/gcc/testsuite/gcc.dg/vect/vect-78-global.c index 241e7fa94b5..ea039b389b2 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-78-global.c +++ b/gcc/testsuite/gcc.dg/vect/vect-78-global.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c index decfbee318a..59e1aae0017 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/vect-89-big-array.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-89.c b/gcc/testsuite/gcc.dg/vect/vect-89.c index 051698eada2..356ab96d330 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-89.c +++ b/gcc/testsuite/gcc.dg/vect/vect-89.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-91.c b/gcc/testsuite/gcc.dg/vect/vect-91.c index 9430da3290a..91264d9841d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-91.c +++ b/gcc/testsuite/gcc.dg/vect/vect-91.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-require-effective-target vect_int } */ /* { dg-additional-options "--param vect-max-peeling-for-alignment=0" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-92.c b/gcc/testsuite/gcc.dg/vect/vect-92.c index b9a1ce23d02..9ceb0fbadcd 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-92.c +++ b/gcc/testsuite/gcc.dg/vect/vect-92.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-96.c b/gcc/testsuite/gcc.dg/vect/vect-96.c index 0cb935b9f16..c0d6c37b21d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-96.c +++ b/gcc/testsuite/gcc.dg/vect/vect-96.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c index c2b1c773047..3887120b747 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-1.c @@ -15,3 +15,5 @@ fn1 () } /* { dg-final { scan-tree-dump "improved number of alias checks from \[0-9\]* to 1" "vect" } } */ +/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c index 0e6285e4a23..b6cc309dbe8 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-10.c @@ -65,3 +65,6 @@ main (void) FOR_EACH_TYPE (DO_TEST) return 0; } + +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c index a0d5abc3aa4..09a4ebfa69e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c @@ -95,3 +95,6 @@ main (void) /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-24, 24\)} "vect" { target vect_double } } } */ /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* step[^ ]* \* 8[)]* is outside \(-32, 32\)} "vect" { target vect_double } } } */ /* { dg-final { scan-tree-dump {run-time check [^\n]* abs \([^*]* \* 8[)]* >= 32} "vect" { target vect_double } } } */ + +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c index 788cdfc3cdc..63a897f4bad 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-12.c @@ -95,3 +95,6 @@ main (void) /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 24\)} "vect" { target vect_double } } } */ /* { dg-final { scan-tree-dump {no alias between [^\n]* when [^\n]* [_a-z][^ ]* \* 8[)]* is outside \[0, 32\)} "vect" { target vect_double } } } */ /* { dg-final { scan-tree-dump {run-time check [^\n]* unsigned \([^*]* \* 8[)]* >= 32} "vect" { target vect_double } } } */ + +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c index 60bc4730724..812aa9027dd 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-13.c @@ -18,4 +18,6 @@ f2 (int *x, long step2, int n) /* { dg-final { scan-tree-dump {need run-time check that [^\n]*step1[^\n]* is nonzero} "vect" } } */ /* { dg-final { scan-tree-dump-not {need run-time check that [^\n]*step2[^\n]* is nonzero} "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ /* { dg-final { 
scan-tree-dump-times {LOOP VECTORIZED} 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c new file mode 100644 index 00000000000..1d148a04918 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c @@ -0,0 +1,64 @@ +#define N 200 +#define M 4 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 17 / 2) + +#define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *a, TYPE *b) \ + { \ + for (int i = 0; i < N; i += 2) \ + { \ + TYPE b0 = b[i + 0]; \ + TYPE b1 = b[i + 1]; \ + a[i + 0] = b0 + 2; \ + a[i + 1] = b1 + 3; \ + } \ + } + +#define DO_TEST(TYPE) \ + for (int j = 0; j < M; ++j) \ + { \ + TYPE a[N + M]; \ + for (int i = 0; i < N + M; ++i) \ + a[i] = TEST_VALUE (i); \ + test_##TYPE (a + j, a); \ + for (int i = 0; i < N; i += 2) \ + { \ + TYPE base1 = j == 0 ? TEST_VALUE (i) : a[i]; \ + TYPE base2 = j <= 1 ? TEST_VALUE (i + 1) : a[i + 1]; \ + if (a[i + j] != (TYPE) (base1 + 2) \ + || a[i + j + 1] != (TYPE) (base2 + 3)) \ + __builtin_abort (); \ + } \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump-not {flags: [^\n]*ARBITRARY\n} "vect" } } */ +/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c new file mode 100644 index 00000000000..fbe3f8431ff --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c @@ -0,0 +1,61 @@ +#define N 200 +#define DIST 32 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define ADD_TEST(TYPE) \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *x, TYPE *y) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + x[i] = i; \ + y[i] = 42 - i * 2; \ + } \ + } + +#define DO_TEST(TYPE) \ + for (int i = 0; i < DIST * 2; ++i) \ + { \ + TYPE a[N + DIST * 2] = {}; \ + test_##TYPE (a + DIST, a + i); \ + for (int j = 0; j < N + DIST * 2; ++j) \ + { \ + TYPE expected = 0; \ + if (i > DIST && j >= i && j < i + N) \ + expected = 42 - (j - i) * 2; \ + if (j >= DIST && j < DIST + N) \ + expected = j - DIST; \ + if (i <= DIST && j >= i && j < i + N) \ + expected = 42 - (j - i) * 2; \ + if (expected != a[j]) \ + __builtin_abort (); \ + } \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c 
b/gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c new file mode 100644 index 00000000000..81c252dfc23 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c @@ -0,0 +1,66 @@ +#define N 200 +#define DIST 32 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 13 / 2) + +#define ADD_TEST(TYPE) \ + TYPE __attribute__((noinline, noclone)) \ + test_##TYPE (TYPE *x, TYPE *y) \ + { \ + TYPE res = 0; \ + for (int i = 0; i < N; ++i) \ + { \ + x[i] = i; \ + res += y[i]; \ + } \ + return res; \ + } + +#define DO_TEST(TYPE) \ + for (int i = 0; i < DIST * 2; ++i) \ + { \ + TYPE a[N + DIST * 2]; \ + for (int j = 0; j < N + DIST * 2; ++j) \ + a[j] = TEST_VALUE (j); \ + TYPE res = test_##TYPE (a + DIST, a + i); \ + for (int j = 0; j < N; ++j) \ + if (a[j + DIST] != (TYPE) j) \ + __builtin_abort (); \ + TYPE expected_res = 0; \ + for (int j = i; j < i + N; ++j) \ + if (i <= DIST && j >= DIST && j < DIST + N) \ + expected_res += j - DIST; \ + else \ + expected_res += TEST_VALUE (j); \ + if (expected_res != res) \ + __builtin_abort (); \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {flags: *RAW\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-17.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-17.c new file mode 100644 index 00000000000..c49c497c2d0 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-17.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_load_lanes } */ + +struct s { int x[100]; }; + +void +f (struct s *s1, int a, int b) +{ + for (int i = 0; i < 32; ++i) + s1->x[a + i] = s1->x[b + i * 2] + s1->x[b + i * 3]; +} + +/* { dg-final { scan-tree-dump {flags: *[^\n]*MIXED_STEPS} "vect" } } */ +/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c new file mode 100644 index 00000000000..9d0739151d9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c @@ -0,0 +1,64 @@ +#define N 200 +#define DIST 32 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 11 / 2) + +#define ADD_TEST(TYPE) \ + TYPE a_##TYPE[N * 2]; \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (int x, int y) \ + { \ + for (int i = 0; i < N; ++i) \ + a_##TYPE[x - i] += a_##TYPE[y - i]; \ + } + +#define DO_TEST(TYPE) \ + for (int i = 0; i < DIST * 2; ++i) \ + { \ + for (int j = 0; j < N + DIST * 2; ++j) \ + a_##TYPE[j] = TEST_VALUE (j); \ + test_##TYPE (i + N - 1, DIST + N - 1); \ + for (int j = 0; j < N + DIST * 2; ++j) \ + { \ + TYPE 
expected; \ + if (j < i || j >= i + N) \ + expected = TEST_VALUE (j); \ + else if (i >= DIST) \ + expected = ((TYPE) TEST_VALUE (j) \ + + (TYPE) TEST_VALUE (j + DIST - i)); \ + else \ + expected = ((TYPE) TEST_VALUE (j) \ + + a_##TYPE[j + DIST - i]); \ + if (expected != a_##TYPE[j]) \ + __builtin_abort (); \ + } \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c new file mode 100644 index 00000000000..7c0ff36a8c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c @@ -0,0 +1,62 @@ +#define N 200 +#define DIST 32 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define ADD_TEST(TYPE) \ + TYPE a_##TYPE[N * 2]; \ + void __attribute__((noinline, noclone)) \ + test_##TYPE (int x, int y) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + a_##TYPE[i + x] = i; \ + a_##TYPE[i + y] = 42 - i * 2; \ + } \ + } + +#define DO_TEST(TYPE) \ + for (int i = 0; i < DIST * 2; ++i) \ + { \ + __builtin_memset (a_##TYPE, 0, sizeof (a_##TYPE)); \ + test_##TYPE (DIST, i); \ + for (int j = 0; j < N + DIST * 2; ++j) \ + { \ + TYPE expected = 0; \ + if (i > DIST && j >= i && j < i + N) \ + expected = 42 - (j - i) * 2; \ + if (j >= DIST && j < DIST + N) \ + expected = j - DIST; \ + if (i <= DIST && j >= i && j < i + N) \ + expected = 42 - (j - i) * 2; \ + if (expected != a_##TYPE[j]) \ + __builtin_abort (); \ + } \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-20.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-20.c new file mode 100644 index 00000000000..8a699ebfda8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-20.c @@ -0,0 +1,66 @@ +#define N 200 +#define DIST 32 + +typedef signed char sc; +typedef unsigned char uc; +typedef signed short ss; +typedef unsigned short us; +typedef int si; +typedef unsigned int ui; +typedef signed long long sll; +typedef unsigned long long ull; + +#define FOR_EACH_TYPE(M) \ + M (sc) M (uc) \ + M (ss) M (us) \ + M (si) M (ui) \ + M (sll) M (ull) \ + M (float) M (double) + +#define TEST_VALUE(I) ((I) * 11 / 2) + +#define ADD_TEST(TYPE) \ + TYPE a_##TYPE[N * 2]; \ + TYPE __attribute__((noinline, noclone)) \ + test_##TYPE (int x, int y) \ + { \ + TYPE res = 0; \ + for (int i = 0; i < N; ++i) \ + { \ + a_##TYPE[i + x] = i; \ + res += a_##TYPE[i + y]; \ + } \ + return res; \ + } + +#define DO_TEST(TYPE) \ + for (int i = 0; i < DIST * 2; ++i) \ + { \ + for (int j = 0; j < N + DIST * 2; ++j) \ + a_##TYPE[j] = TEST_VALUE (j); \ + TYPE res = test_##TYPE (DIST, i); \ + for (int j = 0; j < N; ++j) \ + if (a_##TYPE[j + DIST] != (TYPE) j) \ + 
__builtin_abort (); \ + TYPE expected_res = 0; \ + for (int j = i; j < i + N; ++j) \ + if (i <= DIST && j >= DIST && j < DIST + N) \ + expected_res += j - DIST; \ + else \ + expected_res += TEST_VALUE (j); \ + if (expected_res != res) \ + __builtin_abort (); \ + } + +FOR_EACH_TYPE (ADD_TEST) + +int +main (void) +{ + FOR_EACH_TYPE (DO_TEST) + return 0; +} + +/* { dg-final { scan-tree-dump {flags: *RAW\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an index-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c index 0569ca487b5..7e5df138999 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c @@ -58,3 +58,7 @@ main (void) FOR_EACH_TYPE (DO_TEST) return 0; } + +/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c index 5685bfee576..a7fc1fcebbb 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-9.c @@ -17,7 +17,7 @@ typedef unsigned long long ull; M (sll) M (ull) \ M (float) M (double) -#define TEST_VALUE(I) ((I) * 5 / 2) +#define TEST_VALUE(I) ((I) * 17 / 2) #define ADD_TEST(TYPE) \ void __attribute__((noinline, noclone)) \ @@ -51,3 +51,7 @@ main (void) FOR_EACH_TYPE (DO_TEST) return 0; } + +/* { dg-final { scan-tree-dump {flags: [^\n]*ARBITRARY\n} "vect" { target vect_int } } } */ +/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bswap16.c b/gcc/testsuite/gcc.dg/vect/vect-bswap16.c index 3c98b07e425..d29b352b832 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-bswap16.c +++ b/gcc/testsuite/gcc.dg/vect/vect-bswap16.c @@ -1,4 +1,4 @@ -/* { dg-require-effective-target vect_bswap } */ +/* { dg-additional-options "-msse4" { target sse4_runtime } } */ #include "tree-vect.h" @@ -39,4 +39,4 @@ main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_bswap || sse4_runtime } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-bswap16a.c b/gcc/testsuite/gcc.dg/vect/vect-bswap16a.c new file mode 100644 index 00000000000..730dc4e8352 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-bswap16a.c @@ -0,0 +1,5 @@ +/* { dg-additional-options "-msse2 -mno-sse3" { target sse2_runtime } } */ + +#include "vect-bswap16.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_shift } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-3.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-3.c new file mode 100644 index 00000000000..bb99b95eca5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-3.c @@ -0,0 +1,47 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_condition } */ +/* { dg-require-effective-target vect_float } */ + +#include "tree-vect.h" + +extern void abort (void) __attribute__ ((noreturn)); + +#define N 27 + +/* Condition reduction with different types. */ + +int +condition_reduction (float *a, float min_v) +{ + int last = 0; + + for (int i = 0; i < N; i++) + if (a[i] < min_v) + last = i; + + return last; +} + +int +main (void) +{ + float a[N] = { + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 21, 22, 23, 24, 25, 26, 27 + }; + + check_vect (); + + int ret = condition_reduction (a, 10); + if (ret != 18) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c new file mode 100644 index 00000000000..8820075b1dc --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c @@ -0,0 +1,47 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ +/* { dg-require-effective-target vect_condition } */ +/* { dg-require-effective-target vect_double } */ + +#include "tree-vect.h" + +extern void abort (void) __attribute__ ((noreturn)); + +#define N 27 + +/* Condition reduction with different types. */ + +int +condition_reduction (double *a, double min_v) +{ + int last = 0; + + for (int i = 0; i < N; i++) + if (a[i] < min_v) + last = i; + + return last; +} + +int +main (void) +{ + double a[N] = { + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 21, 22, 23, 24, 25, 26, 27 + }; + + check_vect (); + + int ret = condition_reduction (a, 10); + if (ret != 18) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "optimizing condition reduction with FOLD_EXTRACT_LAST" 2 "vect" { target vect_fold_extract_last } } } */ +/* { dg-final { scan-tree-dump-times "condition expression based on integer induction." 2 "vect" { target { ! vect_fold_extract_last } } } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c index 0ba33895592..079704cee81 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c +++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c @@ -52,5 +52,5 @@ int main () /* Vectorization of loops with multiple types and double reduction is not supported yet. */ -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c new file mode 100644 index 00000000000..946666e918f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ + +/* Copied from PR 88915. 
*/
+void pixel_avg( unsigned char *dst, int i_dst_stride,
+                unsigned char *src1, int i_src1_stride,
+                unsigned char *src2, int i_src2_stride,
+                int i_width, int i_height )
+{
+    for( int y = 0; y < i_height; y++ )
+    {
+        for( int x = 0; x < i_width; x++ )
+            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
+        dst += i_dst_stride;
+        src1 += i_src1_stride;
+        src2 += i_src2_stride;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP EPILOGUE VECTORIZED" "vect" { xfail { arm*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-1.c b/gcc/testsuite/gcc.dg/vect/vect-live-1.c
index e170875d7ab..f628c5d3998 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-live-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-live-1.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-additional-options "-fno-tree-scev-cprop" } */
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-2.c b/gcc/testsuite/gcc.dg/vect/vect-live-2.c
index a6daa61829e..19d8c22859e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-live-2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-live-2.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_long } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-additional-options "-fno-tree-scev-cprop" } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-3.c b/gcc/testsuite/gcc.dg/vect/vect-live-3.c
index 3ffa5166f45..8f5ccb27365 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-live-3.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-live-3.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-4.c b/gcc/testsuite/gcc.dg/vect/vect-live-4.c
index 21cc27320ac..553ffcd49f7 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-live-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-live-4.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c b/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c
index aff37c100f0..965437c8f03 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-additional-options "-fno-tree-scev-cprop" } */
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c b/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c
index 35689665b54..0d2f17f9003 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c
@@ -1,3 +1,5 @@
+/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-additional-options "-fno-tree-scev-cprop" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c b/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c index 854116fa36e..a3f60f6ce6d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-live-slp-3.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_long } */ /* { dg-additional-options "-fno-tree-scev-cprop" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c index 18bf5e80917..1f82121df06 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-3.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c index 43887865bf4..b0f74083f2b 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c index b47a93ab326..864b17ac640 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c +++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-6.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options double_vectors } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c index 13238dbe2f9..e65a092f5bf 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c @@ -23,4 +23,4 @@ foo (){ return; } -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c index d1fbe346a48..a88014a2fbf 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c @@ -65,4 +65,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! 
aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c index d1fbe346a48..a88014a2fbf 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c @@ -65,4 +65,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c index d1fbe346a48..a88014a2fbf 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c @@ -65,4 +65,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c index d1fbe346a48..4f95c652ee3 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c @@ -65,4 +65,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail { ! aarch64*-*-* } } } }*/ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-call-1.c b/gcc/testsuite/gcc.dg/vect/vect-outer-call-1.c new file mode 100644 index 00000000000..f26d4220532 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-call-1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-additional-options "-fno-math-errno" } */ + +void +foo (float * __restrict x, float *y, int n, int m) +{ + if (m > 0) + for (int i = 0; i < n; ++i) + { + float tem = x[i], tem1; + for (int j = 0; j < m; ++j) + { + tem += y[j]; + tem1 = tem; + tem = __builtin_sqrtf (tem); + } + x[i] = tem - tem1; + } +} + +/* { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_call_sqrtf } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1-epilogue.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1-epilogue.c new file mode 100644 index 00000000000..cc23c6b0866 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1-epilogue.c @@ -0,0 +1,3 @@ +/* { dg-require-effective-target vect_int } */ + +#include "vect-peel-1-src.c" diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1-src.c new file mode 100644 index 00000000000..7980d4dd643 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1-src.c @@ -0,0 +1,48 @@ +#include +#include "tree-vect.h" + +#define N 128 + +int ib[N+7]; + +__attribute__ ((noinline)) +int main1 () +{ + int i; + int ia[N+1]; + + /* All the accesses are misaligned. With cost model disabled, we + count the number of aligned accesses for each peeling option, and + in this case we align the two loads if possible (i.e., if + misaligned stores are supported). 
*/ + for (i = 1; i <= N; i++) + { + ia[i] = ib[i+2] + ib[i+6]; + } + + /* check results: */ + for (i = 1; i <= N; i++) + { + if (ia[i] != ib[i+2] + ib[i+6]) + abort (); + } + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i <= N+6; i++) + { + asm volatile ("" : "+r" (i)); + ib[i] = i; + } + + return main1 (); +} + + diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c index fae99ab0b08..a7660a381c4 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-peel-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c @@ -1,51 +1,8 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ -#include -#include "tree-vect.h" - -#define N 128 - -int ib[N+7]; - -__attribute__ ((noinline)) -int main1 () -{ - int i; - int ia[N+1]; - - /* All the accesses are misaligned. With cost model disabled, we - count the number of aligned accesses for each peeling option, and - in this case we align the two loads if possible (i.e., if - misaligned stores are supported). */ - for (i = 1; i <= N; i++) - { - ia[i] = ib[i+2] + ib[i+6]; - } - - /* check results: */ - for (i = 1; i <= N; i++) - { - if (ia[i] != ib[i+2] + ib[i+6]) - abort (); - } - - return 0; -} - -int main (void) -{ - int i; - - check_vect (); - - for (i = 0; i <= N+6; i++) - { - asm volatile ("" : "+r" (i)); - ib[i] = i; - } - - return main1 (); -} +#include "vect-peel-1-src.c" /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { { vect_element_align } && { vect_aligned_arrays } } xfail { ! vect_unaligned_possible } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3-epilogue.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3-epilogue.c new file mode 100644 index 00000000000..8af0fcdca0e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3-epilogue.c @@ -0,0 +1,4 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-add-options bind_pic_locally } */ + +#include "vect-peel-3-src.c" diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3-src.c new file mode 100644 index 00000000000..a21ce8c3d6a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3-src.c @@ -0,0 +1,58 @@ +#include +#include "tree-vect.h" + +#if VECTOR_BITS > 128 +#define NINTS (VECTOR_BITS / 32) +#define EXTRA (NINTS * 2) +#else +#define NINTS 4 +#define EXTRA 10 +#endif + +#define N 128 + +#define RES_A (N * N / 4) +#define RES_B (N * (N + 1) / 2 + (NINTS + 3) * (N + 1)) +#define RES_C (N * (N + 1) / 2 + (N + 1)) +#define RES (RES_A + RES_B + RES_C) + +int ib[N + EXTRA]; +int ia[N + EXTRA]; +int ic[N + EXTRA]; + +__attribute__ ((noinline)) +int main1 () +{ + int i, suma = 0, sumb = 0, sumc = 0; + + /* ib and ic have same misalignment, we peel to align them. 
*/ + for (i = 0; i <= N; i++) + { + suma += ia[i]; + sumb += ib[i + NINTS + 1]; + sumc += ic[i + 1]; + } + + /* check results: */ + if (suma + sumb + sumc != RES) + abort (); + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N + EXTRA; i++) + { + asm volatile ("" : "+r" (i)); + ib[i] = i; + ic[i] = i+2; + ia[i] = i/2; + } + + return main1 (); +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c index d5c0cf10ce1..2cd99573fd1 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-peel-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c @@ -1,64 +1,9 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ -#include -#include "tree-vect.h" - -#if VECTOR_BITS > 128 -#define NINTS (VECTOR_BITS / 32) -#define EXTRA (NINTS * 2) -#else -#define NINTS 4 -#define EXTRA 10 -#endif - -#define N 128 - -#define RES_A (N * N / 4) -#define RES_B (N * (N + 1) / 2 + (NINTS + 3) * (N + 1)) -#define RES_C (N * (N + 1) / 2 + (N + 1)) -#define RES (RES_A + RES_B + RES_C) - -int ib[N + EXTRA]; -int ia[N + EXTRA]; -int ic[N + EXTRA]; - -__attribute__ ((noinline)) -int main1 () -{ - int i, suma = 0, sumb = 0, sumc = 0; - - /* ib and ic have same misalignment, we peel to align them. */ - for (i = 0; i <= N; i++) - { - suma += ia[i]; - sumb += ib[i + NINTS + 1]; - sumc += ic[i + 1]; - } - - /* check results: */ - if (suma + sumb + sumc != RES) - abort (); - - return 0; -} - -int main (void) -{ - int i; - - check_vect (); - - for (i = 0; i < N + EXTRA; i++) - { - asm volatile ("" : "+r" (i)); - ib[i] = i; - ic[i] = i+2; - ia[i] = i/2; - } - - return main1 (); -} +#include "vect-peel-3-src.c" /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */ /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { xfail { { ! vect_unaligned_possible } || vect_sizes_32B_16B } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4-epilogue.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4-epilogue.c new file mode 100644 index 00000000000..783982f04f6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4-epilogue.c @@ -0,0 +1,4 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-add-options bind_pic_locally } */ + +#include "vect-peel-4-src.c" diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4-src.c new file mode 100644 index 00000000000..33088fb0902 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4-src.c @@ -0,0 +1,45 @@ +#include +#include "tree-vect.h" + +#define N 128 + +int ib[N+7]; +int ia[N+1]; + +__attribute__ ((noinline)) +int main1 () +{ + int i; + + /* Don't peel keeping one load and the store aligned. 
*/ + for (i = 0; i <= N; i++) + { + ia[i] = ib[i] + ib[i+5]; + } + + /* check results: */ + for (i = 1; i <= N; i++) + { + if (ia[i] != ib[i] + ib[i+5]) + abort (); + } + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i <= N+6; i++) + { + asm volatile ("" : "+r" (i)); + ib[i] = i; + } + + return main1 (); +} + + diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c index 88f9f0ddcba..3b5272f284f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-peel-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c @@ -1,49 +1,9 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-add-options bind_pic_locally } */ -#include -#include "tree-vect.h" - -#define N 128 - -int ib[N+7]; -int ia[N+1]; - -__attribute__ ((noinline)) -int main1 () -{ - int i; - - /* Don't peel keeping one load and the store aligned. */ - for (i = 0; i <= N; i++) - { - ia[i] = ib[i] + ib[i+5]; - } - - /* check results: */ - for (i = 1; i <= N; i++) - { - if (ia[i] != ib[i] + ib[i+5]) - abort (); - } - - return 0; -} - -int main (void) -{ - int i; - - check_vect (); - - for (i = 0; i <= N+6; i++) - { - asm volatile ("" : "+r" (i)); - ib[i] = i; - } - - return main1 (); -} +#include "vect-peel-4-src.c" /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */ /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { xfail { ! vect_unaligned_possible } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c index e246ae7f3c6..c40f8625b84 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char-big-array.c @@ -62,4 +62,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c index 5f0551ee372..dd3045502f1 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c @@ -46,4 +46,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c index 02c2bee8612..1a2d8d04f4e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c @@ -45,4 +45,4 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c index ad148046a8e..cc0d9694a4f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_float_strict } */ /* { dg-additional-options "-fno-fast-math" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c index 171451872e5..ffbc9706901 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c index ac674749b6f..05e343ad782 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ /* { dg-additional-options "-march=armv8.2-a+dotprod" { target { aarch64*-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c index b036ad5b0b4..e0f47d8a4f2 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include @@ -12,12 +14,6 @@ signed char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); /* char->short->short dot product. The dot-product pattern should be detected. - The reduction is currently not vectorized becaus of the signed->unsigned->signed - casts, since this patch: - - 2005-12-26 Kazu Hirata - - PR tree-optimization/25125 When the dot-product is detected, the loop should be vectorized on vect_sdot_qi targets (targets that support dot-product of signed char). @@ -60,5 +56,5 @@ int main (void) /* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */ /* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c index 57e18040cf2..0fc112012cf 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c index d020f643bb8..e23ebd9b072 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */ /* { dg-additional-options "-march=armv8.2-a+dotprod" { target { aarch64*-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c index 3155d97b3cd..288be13440d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-epilogue-gaps.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-epilogue-gaps.c new file mode 100644 index 00000000000..dc5704f5607 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-epilogue-gaps.c @@ -0,0 +1,45 @@ +/* { dg-options "-O3 -fno-vect-cost-model" } */ +struct { + float real; + float img; +} g[11]; + +float __attribute__ ((noclone)) +foo_11 (void) +{ + float sum = 0.0; + for (int i = 0; i < 11; ++i) + sum += g[i].real; + return sum; +} + +float __attribute__ ((noclone)) +foo_10 (void) +{ + float sum = 0.0; + for (int i = 0; i < 10; ++i) + sum += g[i].real; + return sum; +} + +int main (void) +{ + float check_10 = 0.0; + float check_11 = 0.0; + for (int i = 0; i < 11; ++i) + { + asm volatile ("" : : : "memory"); + g[i].real = (float) i; + g[i].img = (float) -i; + if (i < 10) + check_10 += (float) i; + check_11 += (float) i; + } + + if (foo_10 () != check_10) + __builtin_abort (); + if (foo_11 () != check_11) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c index b06b234072b..1ddbe96ebc3 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c index be03c7d011d..7ae2c838344 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c index c30c85ce911..91ce0ef934e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c index a98edd3045a..2190eaa6242 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c index 570e56a8c9b..6ad645b3bdd 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c index 8190622d5d7..71df5741e16 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2c.c @@ -21,6 +21,8 @@ foo () 2005-12-26 Kazu Hirata PR tree-optimization/25125 + + but we still handle the reduction. */ for (i = 0; i < N; i++) @@ -43,5 +45,4 @@ main (void) } /* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 1 "vect" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" { target { ! vect_widen_sum_qi_to_hi } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c index a033a7d27d1..2f0bb692564 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_usad_char } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c b/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c index b912a3431f7..e5bbeaede09 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-tail-nomask-1.c @@ -106,4 +106,4 @@ main (int argc, const char **argv) } /* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" { target avx2_runtime } } } */ -/* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */ +/* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED \\(MODE=V16QI\\)" 2 "vect" { target avx2_runtime } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c index 89f983cad06..4c95dd20179 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-s16.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-additional-options "-fno-ipa-icf" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c index e319699cd92..4075f815cea 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-const-u16.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-additional-options "-fno-ipa-icf" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c index ee0538c0635..c4ac88e186d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half-u8.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ /* { dg-additional-options "-fno-ipa-icf" } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c index 6d74c693316..ebbf4f5e841 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-half.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include "tree-vect.h" diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c index 942f63d6f31..2e28baae0b8 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u16.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c index 98f78d3b37a..d277f0b2b94 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c index 176f183f3ce..6fc7a282351 100644 --- a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c +++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-require-effective-target vect_int } */ #include diff --git a/gcc/testsuite/gcc.dg/vshift-5.c b/gcc/testsuite/gcc.dg/vshift-5.c index daa5f1c5cd8..62e6328cb28 100644 --- a/gcc/testsuite/gcc.dg/vshift-5.c +++ b/gcc/testsuite/gcc.dg/vshift-5.c @@ -40,6 +40,42 @@ f2 (void) a[3] = a3; } +__attribute__((noinline, noclone)) void +f2a (int x) +{ + long long a0, a1, a2, a3; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a0 = a0 << x; + a1 = a1 << 2; + a2 = a2 << 2; + a3 = a3 << 2; + a[0] = a0; + a[1] = a1; + a[2] = a2; + a[3] = a3; +} + +__attribute__((noinline, noclone)) void +f2b (int x) +{ + long long a0, a1, a2, a3; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a0 = a0 << 2; + a1 = a1 << 2; + a2 = a2 << x; + a3 = a3 << 2; + a[0] = a0; + a[1] = a1; + a[2] = a2; + a[3] = a3; +} + __attribute__((noinline, noclone)) void f3 (int x) { @@ -77,5 +113,13 @@ main () if (a[0] != (4LL << 7) || a[1] != (3LL << 8) || a[2] != (2LL << 9) || a[3] != (1LL << 10)) abort (); + f2a (3); + if (a[0] != (4LL << 10) || a[1] != (3LL << 10) + || a[2] != (2LL << 11) || a[3] != (1LL << 12)) + abort (); + f2b (3); + if (a[0] != (4LL << 12) || a[1] != (3LL << 12) + || a[2] != (2LL << 14) || a[3] != (1LL << 14)) + abort (); return 0; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c new file mode 100644 index 00000000000..8ff66714e9b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_DOT(TYPE1, TYPE2) \ +TYPE1 __attribute__ ((noinline, noclone)) \ +dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \ +{ \ + TYPE1 sum = 0; \ + for (int i = 0; i < n; i++) \ + { \ + sum += x[i] * y[i]; \ + } \ + return sum; \ +} + +DEF_DOT(uint32_t, uint8_t) +DEF_DOT(int32_t, int8_t) +DEF_DOT(int64_t, int16_t) + +/* The uint16_t->uint64_t dot product requires a casting to satisfy the C + language rules. 
*/ +uint64_t __attribute__ ((noinline, noclone)) +dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n) +{ + uint64_t sum = 0; + for (int i = 0; i < n; i++) + { + sum += (unsigned int)x[i] * y[i]; + } + return sum; +} + +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c index 5c04bcdb3f5..51925fa8f50 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c @@ -17,3 +17,4 @@ f (double *restrict a, double *restrict b, double *restrict c, /* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ /* { dg-final { scan-assembler-not {\tfmad\t} } } */ +/* { dg-final { scan-assembler-times {\tst1d} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c new file mode 100644 index 00000000000..78c70b2be32 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c @@ -0,0 +1,90 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define MASK_SLP_2(TYPE_COND, ALT_VAL) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 2) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ + } \ +} + +#define MASK_SLP_4(TYPE_COND, ALT_VAL) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 4) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ + x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ + x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \ + } \ +} + +#define MASK_SLP_8(TYPE_COND, ALT_VAL) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 8) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ + x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ + x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \ + x[i + 4] = y[i + 4] ? z[i + 4] : 1; \ + x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL; \ + x[i + 6] = y[i + 6] ? z[i + 6] : 1; \ + x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL; \ + } \ +} + +#define MASK_SLP_FAIL(TYPE_COND) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 2) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? 
z[i + 1] : x[z[i + 1]]; \ + } \ +} + +MASK_SLP_2(int8_t, 1) +MASK_SLP_2(int8_t, 2) +MASK_SLP_2(int, 1) +MASK_SLP_2(int, 2) +MASK_SLP_2(int64_t, 1) +MASK_SLP_2(int64_t, 2) + +MASK_SLP_4(int8_t, 1) +MASK_SLP_4(int8_t, 2) +MASK_SLP_4(int, 1) +MASK_SLP_4(int, 2) +MASK_SLP_4(int64_t, 1) +MASK_SLP_4(int64_t, 2) + +MASK_SLP_8(int8_t, 1) +MASK_SLP_8(int8_t, 2) +MASK_SLP_8(int, 1) +MASK_SLP_8(int, 2) +MASK_SLP_8(int64_t, 1) +MASK_SLP_8(int64_t, 2) + +MASK_SLP_FAIL(int8_t) +MASK_SLP_FAIL(int) +MASK_SLP_FAIL(int64_t) + +/* { dg-final { scan-assembler-not {\tld2w\t} } } */ +/* { dg-final { scan-assembler-not {\tst2w\t} } } */ +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */ +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c index a258344b0a9..f152d04b473 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_1.c @@ -105,8 +105,8 @@ reduc_##NAME##_##TYPE (TYPE *a, int n) \ TEST_BITWISE (DEF_REDUC_BITWISE) -/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ /* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ /* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ @@ -157,8 +157,8 @@ TEST_BITWISE (DEF_REDUC_BITWISE) /* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ /* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c index 376a453fc73..0640cba8e0f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_2.c @@ -116,8 +116,8 @@ reduc_##NAME##TYPE (TYPE (*restrict a)[NUM_ELEMS(TYPE)], \ TEST_BITWISE (DEF_REDUC_BITWISE) -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */ 
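Note: the count changes in reduc_1.c and reduc_2.c above (1 -> 2 for the .b and .h forms of ADD and UADDV) follow from byte and halfword additive reductions now being vectorized for signed as well as unsigned element types, as the removed XFAIL comment in reduc_5.c below also indicates. As a purely illustrative sketch (not part of the patched tests; the function name is made up), the loops those scans cover have this shape:

#include <stdint.h>

/* Narrow additive reduction: with SVE this is expected to use a
   predicated ADD on z-registers inside the loop and a final UADDV
   outside it, which is what the adjusted scan-assembler counts
   above look for.  */
int8_t
reduc_plus_int8_t (int8_t *a, int n)
{
  int8_t res = 0;
  for (int i = 0; i < n; ++i)
    res += a[i];
  return res;
}
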
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c index ff535942331..cced4ad488e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_5.c @@ -23,16 +23,12 @@ REDUC (uint64_t) REDUC (float) REDUC (double) -/* XFAILed until we support sub-int reductions for signed types. */ -/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m} 2 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m} 2 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m} 1 } } */ -/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m} 1 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m} 2 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m} 2 } } */ /* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m} 2 } } */ /* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m} 2 } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m} 1 } } */ /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m} 1 } } */ -/* XFAILed until we support sub-int reductions for signed types. */ -/* { dg-final { scan-assembler-times {\tsub\t} 8 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tsub\t} 8 } } */ /* { dg-final { scan-assembler-times {\tfsub\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c index 3913b8848c0..dec4c87e54d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_8.c @@ -15,6 +15,5 @@ reduc (int *restrict a, int *restrict b, int *restrict c) } /* { dg-final { scan-assembler-times {\tcmpne\tp[0-9]+\.s, } 1 } } */ -/* We ought to use the CMPNE result for the SEL too. 
*/ -/* { dg-final { scan-assembler-not {\tcmpeq\tp[0-9]+\.s, } { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tcmpeq\tp[0-9]+\.s, } } } */ /* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, } 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c index a718e9d2ebf..83ebec50bc6 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c @@ -1,10 +1,7 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -fno-inline -msve-vector-bits=256 -fdump-tree-vect-details" } */ +/* { dg-options "-O2 -ftree-vectorize" } */ -double mat[100][4]; -double mat2[100][8]; -double mat3[100][12]; -double mat4[100][3]; +double mat[100][2]; double slp_reduc_plus (int n) @@ -14,115 +11,8 @@ slp_reduc_plus (int n) { tmp = tmp + mat[i][0]; tmp = tmp + mat[i][1]; - tmp = tmp + mat[i][2]; - tmp = tmp + mat[i][3]; } return tmp; } -double -slp_reduc_plus2 (int n) -{ - double tmp = 0.0; - for (int i = 0; i < n; i++) - { - tmp = tmp + mat2[i][0]; - tmp = tmp + mat2[i][1]; - tmp = tmp + mat2[i][2]; - tmp = tmp + mat2[i][3]; - tmp = tmp + mat2[i][4]; - tmp = tmp + mat2[i][5]; - tmp = tmp + mat2[i][6]; - tmp = tmp + mat2[i][7]; - } - return tmp; -} - -double -slp_reduc_plus3 (int n) -{ - double tmp = 0.0; - for (int i = 0; i < n; i++) - { - tmp = tmp + mat3[i][0]; - tmp = tmp + mat3[i][1]; - tmp = tmp + mat3[i][2]; - tmp = tmp + mat3[i][3]; - tmp = tmp + mat3[i][4]; - tmp = tmp + mat3[i][5]; - tmp = tmp + mat3[i][6]; - tmp = tmp + mat3[i][7]; - tmp = tmp + mat3[i][8]; - tmp = tmp + mat3[i][9]; - tmp = tmp + mat3[i][10]; - tmp = tmp + mat3[i][11]; - } - return tmp; -} - -void -slp_non_chained_reduc (int n, double * restrict out) -{ - for (int i = 0; i < 3; i++) - out[i] = 0; - - for (int i = 0; i < n; i++) - { - out[0] = out[0] + mat4[i][0]; - out[1] = out[1] + mat4[i][1]; - out[2] = out[2] + mat4[i][2]; - } -} - -/* Strict FP reductions shouldn't be used for the outer loops, only the - inner loops. */ - -float -double_reduc1 (float (*restrict i)[16]) -{ - float l = 0; - - for (int a = 0; a < 8; a++) - for (int b = 0; b < 8; b++) - l += i[b][a]; - return l; -} - -float -double_reduc2 (float *restrict i) -{ - float l = 0; - - for (int a = 0; a < 8; a++) - for (int b = 0; b < 16; b++) - { - l += i[b * 4]; - l += i[b * 4 + 1]; - l += i[b * 4 + 2]; - l += i[b * 4 + 3]; - } - return l; -} - -float -double_reduc3 (float *restrict i, float *restrict j) -{ - float k = 0, l = 0; - - for (int a = 0; a < 8; a++) - for (int b = 0; b < 8; b++) - { - k += i[b]; - l += j[b]; - } - return l * k; -} - -/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 4 } } */ -/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */ -/* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3. Each one - is reported three times, once for SVE, once for 128-bit AdvSIMD and once - for 64-bit AdvSIMD. */ -/* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */ -/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3. 
*/ -/* { dg-final { scan-tree-dump-times "Detected reduction" 10 "vect" } } */ +/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c index 0b2a7ad57e3..37b5f1148a3 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_13.c @@ -32,7 +32,6 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ TEST_ALL (VEC_PERM) -/* ??? We don't treat the int8_t and int16_t loops as reductions. */ /* ??? We don't treat the uint loops as SLP. */ /* The loop should be fully-masked. */ /* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */ @@ -41,15 +40,15 @@ TEST_ALL (VEC_PERM) /* { dg-final { scan-assembler-times {\tld1w\t} 2 } } */ /* { dg-final { scan-assembler-times {\tld1d\t} 3 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tld1d\t} 2 } } */ -/* { dg-final { scan-assembler-not {\tldr} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tldr} } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */ /* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c index b75edc69e2d..6a199d00659 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c @@ -33,34 +33,24 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \ TEST_ALL (VEC_PERM) -/* ??? We don't think it's worth using SLP for the 64-bit loops and fall - back to the less efficient non-SLP implementation instead. */ -/* ??? At present we don't treat the int8_t and int16_t loops as - reductions. 
*/ -/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */ -/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */ /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */ /* { dg-final { scan-assembler-times {\tld1d\t} 3 } } */ /* { dg-final { scan-assembler-not {\tld2b\t} } } */ /* { dg-final { scan-assembler-not {\tld2h\t} } } */ /* { dg-final { scan-assembler-not {\tld2w\t} } } */ /* { dg-final { scan-assembler-not {\tld2d\t} } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */ /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */ /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */ /* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */ -/* Should be 4 and 6 respectively, if we used reductions for int8_t and - int16_t. */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c index 9e6aa8ccbf8..19207207999 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c @@ -31,45 +31,27 @@ vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \ T (uint16_t) \ T (int32_t) \ T (uint32_t) \ - T (int64_t) \ - T (uint64_t) \ T (_Float16) \ - T (float) \ - T (double) + T (float) TEST_ALL (VEC_PERM) -/* We can't use SLP for the 64-bit loops, since the number of reduction - results might be greater than the number of elements in the vector. - Otherwise we have two loads per loop, one for the initial vector - and one for the loop body. */ -/* ??? At present we don't treat the int8_t and int16_t loops as - reductions. */ -/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */ -/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */ +/* We have two loads per loop, one for the initial vector and one for + the loop body. 
*/ +/* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */ /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */ -/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */ /* { dg-final { scan-assembler-not {\tld4b\t} } } */ /* { dg-final { scan-assembler-not {\tld4h\t} } } */ /* { dg-final { scan-assembler-not {\tld4w\t} } } */ -/* { dg-final { scan-assembler-not {\tld1d\t} } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 } } */ /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */ -/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */ /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */ -/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */ -/* Should be 4 and 6 respectively, if we used reductions for int8_t and - int16_t. */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */ /* { dg-final { scan-assembler-not {\tuqdec} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c index 68baba9e965..40ff2d561a8 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c @@ -15,12 +15,9 @@ f (TYPE *x, TYPE *y, unsigned short n, l /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ -/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ -/* { dg-final { scan-assembler-not {, 1024} } } */ -/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ -/* Two range checks and a check for n being zero. */ -/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ -/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ +/* Should use a WAR check that multiplies by (VF-2)*4 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* One range check and a check for n being zero. 
*/ +/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c index 30f6d2691b8..b8afea70207 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c @@ -15,7 +15,7 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ -/* Should multiply by (257-1)*4 rather than (VF-1)*4. */ +/* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4. */ /* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */ /* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */ /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c index 70792ff9f33..5ab6859ad4e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c @@ -15,13 +15,10 @@ f (TYPE *x, TYPE *y, int n, long m __attribute__((unused))) /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ -/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ -/* { dg-final { scan-assembler-not {, 1024} } } */ -/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ -/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ -/* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ -/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ -/* Two range checks and a check for n being zero. */ -/* { dg-final { scan-assembler {\tcmp\t} } } */ -/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ +/* Should use a WAR check that multiplies by (VF-2)*4 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+[^\n]*xzr} 1 } } */ +/* One range check and a check for n being zero. */ +/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c index 688f3be61d7..93c114193e9 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c @@ -15,13 +15,10 @@ f (TYPE *x, TYPE *y, long n, long m __attribute__((unused))) /* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\td[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\td[0-9]+} } } */ -/* Should multiply by (VF-1)*8 rather than (257-1)*8. */ -/* { dg-final { scan-assembler-not {, 2048} } } */ -/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ -/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]11} } } */ -/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ -/* Two range checks and a check for n being zero. 
*/ -/* { dg-final { scan-assembler {\tcmp\t} } } */ -/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ +/* Should use a WAR check that multiplies by (VF-2)*8 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #16\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+[^\n]*xzr} 1 } } */ +/* One range check and a check for n being zero. */ +/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c index 00d84760a19..b38f23e87ba 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c @@ -98,24 +98,24 @@ TEST_CMP (nugt) /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ /* 5 for lt, 5 for ult and 5 for nult. */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* 5 for le, 5 for ule and 5 for nule. */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* 5 for gt, 5 for ugt and 5 for nugt. */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* 5 for ge, 5 for uge and 5 for nuge. */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */ /* 3 loops * 5 invocations for all 12 unordered comparisons. 
*/ -/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */ /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */ @@ -123,19 +123,19 @@ TEST_CMP (nugt) /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */ /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, for all 12 unordered comparisons. */ -/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c index 23bfb7b2649..2f16fbff522 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c @@ -19,16 +19,16 @@ /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */ /* 5 for le, 5 for ule and 5 for nule. 
*/ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */ /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt. */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */ /* 5 for ge, 5 for uge and 5 for nuge. */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */ /* 3 loops * 5 invocations for ordered, unordered amd ueq. */ @@ -43,14 +43,14 @@ /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */ /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */ -/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */ +/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */ /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */ /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2, diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_1.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_1.c new file mode 100644 index 00000000000..fe490cfbf3f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_1.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include + +void +f (int64_t *x, int64_t *y, int32_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] += y[i]; + z[i] += z[i - 2]; + } +} + +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2s,} 1 } } */ diff --git 
a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_10.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_10.c new file mode 100644 index 00000000000..81e77a8bb04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_10.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int16_t *x, int16_t *y, uint8_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 8]; + } +} + +/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_11.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_11.c new file mode 100644 index 00000000000..d9da6c1f12a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_11.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int32_t *x, int64_t *y, int64_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 2]; + } +} + +/* { dg-final { scan-assembler-times {\txtn\tv[0-9]+\.2s, v[0-9]+\.2d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_12.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_12.c new file mode 100644 index 00000000000..80dab8bf55f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_12.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int16_t *x, int32_t *y, int32_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 4]; + } +} + +/* { dg-final { scan-assembler-times {\txtn\tv[0-9]+\.4h, v[0-9]+\.4s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_13.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_13.c new file mode 100644 index 00000000000..655fa7d4bf1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_13.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int8_t *x, int16_t *y, int16_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 8]; + } +} + +/* { dg-final { scan-assembler-times {\txtn\tv[0-9]+\.8b, v[0-9]+\.8h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_2.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_2.c new file mode 100644 index 00000000000..1fe69cad259 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_2.c @@ -0,0 +1,19 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int32_t *x, int32_t *y, int16_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] += y[i]; + z[i] += z[i - 4]; + } +} + +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4h,} 1 } } */ +/* { dg-final { scan-assembler-not {\tadd\tv[0-9]+\.2s,} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c new file mode 100644 index 00000000000..1290772216e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_3.c @@ -0,0 +1,19 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int16_t *x, int16_t *y, int8_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] += y[i]; + z[i] += z[i - 8]; + } +} + +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8b,} 1 } } */ +/* { dg-final { scan-assembler-not {\tadd\tv[0-9]+\.4h,} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_4.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_4.c new file mode 100644 index 00000000000..768ea8c7164 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_4.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int64_t *x, int64_t *y, int8_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] += y[i]; + z[i] += z[i - 8]; + } +} + +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8b,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_5.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_5.c new file mode 100644 index 00000000000..ca8a65a16e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_5.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int64_t *x, int64_t *y, int32_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 2]; + } +} + +/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.2d, v[0-9]+\.2s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c new file mode 100644 index 00000000000..6c09b5b146b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_6.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int32_t *x, int32_t *y, int16_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 4]; + } +} + +/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c new file mode 100644 index 00000000000..94a66c545ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_7.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int16_t *x, int16_t *y, int8_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 8]; + } +} + +/* { dg-final { scan-assembler-times {\tsxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_8.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_8.c new file mode 100644 index 00000000000..9531966c294 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_8.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int64_t *x, int64_t *y, uint32_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 2]; + } +} + +/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.2d, v[0-9]+\.2s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_9.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_9.c new file mode 100644 index 00000000000..de8f6988685 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_9.c @@ -0,0 +1,18 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#pragma GCC target "+nosve" + +#include <stdint.h> + +void +f (int32_t *x, int32_t *y, uint16_t *z, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i] = z[i]; + y[i] += y[i - 4]; + } +} + +/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c index ae2f8611ea6..9d926ca5dfe 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-19.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -mavx -mtune=generic -dp" } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c index 2a105601c71..51765900fcf 100644 --- a/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c +++ b/gcc/testsuite/gcc.target/i386/avx2-vect-mask-store-move1.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ /* { dg-require-effective-target avx2 } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c b/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c index a26aa6529e8..4de04511934 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-gather-2.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* PR59617 */ /* { dg-options "-O3 -mavx512f -fdump-tree-vect-details -mtune=knl" } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c b/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c index 2bb9c5c090b..946117d9d30 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-gather-5.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans.
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -mavx512f -mtune=knl" } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c new file mode 100644 index 00000000000..235fb917e17 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-fopenmp-simd -O2 -mavx512f -masm=att" } */ +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%xmm" } } */ +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%ymm" } } */ +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%zmm" } } */ + +#define N 1024 +int a[N]; + +void +f1 (void) +{ + int i; + #pragma omp simd simdlen (4) + for (i = 0; i < N; ++i) + a[i] = a[i] + 1; +} + +void +f2 (void) +{ + int i; + #pragma omp simd simdlen (8) + for (i = 0; i < N; ++i) + a[i] = a[i] + 2; +} + +void +f3 (void) +{ + int i; + #pragma omp simd simdlen (16) + for (i = 0; i < N; ++i) + a[i] = a[i] + 3; +} diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_1.c b/gcc/testsuite/gcc.target/i386/l_fma_double_1.c index e5bcdabcf79..2472fb016ee 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_double_1.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_double_1.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_2.c b/gcc/testsuite/gcc.target/i386/l_fma_double_2.c index dbd078abc81..3d569733b1e 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_double_2.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_double_2.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_3.c b/gcc/testsuite/gcc.target/i386/l_fma_double_3.c index d0844f208e5..8e5ec4150cc 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_double_3.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_double_3.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_4.c b/gcc/testsuite/gcc.target/i386/l_fma_double_4.c index b9498a0ff13..0d2a0408d0b 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_double_4.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_double_4.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_5.c b/gcc/testsuite/gcc.target/i386/l_fma_double_5.c index 0292ba040a3..fcf1a6ceac1 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_double_5.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_double_5.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_double_6.c b/gcc/testsuite/gcc.target/i386/l_fma_double_6.c index a716006eda8..650e608117f 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_double_6.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_double_6.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_1.c b/gcc/testsuite/gcc.target/i386/l_fma_float_1.c index b386b83e39a..c29198ba666 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_float_1.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_float_1.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_2.c b/gcc/testsuite/gcc.target/i386/l_fma_float_2.c index 81193b2d8b1..cb38b77344f 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_float_2.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_float_2.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_3.c b/gcc/testsuite/gcc.target/i386/l_fma_float_3.c index d86cb904357..10a350e9e10 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_float_3.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_float_3.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_4.c b/gcc/testsuite/gcc.target/i386/l_fma_float_4.c index 68ca8388d70..020e5d86f35 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_float_4.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_float_4.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_5.c b/gcc/testsuite/gcc.target/i386/l_fma_float_5.c index 4db4749c024..3ff23c17aab 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_float_5.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_float_5.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. 
*/ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/l_fma_float_6.c b/gcc/testsuite/gcc.target/i386/l_fma_float_6.c index 0b86e6256bd..34671baa28a 100644 --- a/gcc/testsuite/gcc.target/i386/l_fma_float_6.c +++ b/gcc/testsuite/gcc.target/i386/l_fma_float_6.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-O3 -Wno-attributes -mfpmath=sse -mfma -mtune=generic -mno-fma4" } */ diff --git a/gcc/testsuite/gcc.target/i386/mask-pack.c b/gcc/testsuite/gcc.target/i386/mask-pack.c index 0b564ef4284..a607dfb460c 100644 --- a/gcc/testsuite/gcc.target/i386/mask-pack.c +++ b/gcc/testsuite/gcc.target/i386/mask-pack.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-mavx512bw -O3 -fopenmp-simd -fdump-tree-vect-details" } */ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/i386/mask-unpack.c b/gcc/testsuite/gcc.target/i386/mask-unpack.c index 4291480cfff..ca71ea2e29d 100644 --- a/gcc/testsuite/gcc.target/i386/mask-unpack.c +++ b/gcc/testsuite/gcc.target/i386/mask-unpack.c @@ -1,3 +1,5 @@ +/* Disabling epilogues until we find a better way to deal with scans. */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ /* { dg-do compile } */ /* { dg-options "-mavx512bw -mavx512dq -mno-stackrealign -O3 -fopenmp-simd -fdump-tree-vect-details" } */ /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr90358.c b/gcc/testsuite/gcc.target/i386/pr90358.c new file mode 100644 index 00000000000..4894fdbd079 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr90358.c @@ -0,0 +1,35 @@ +/* PR target/90358 */ +/* { dg-do run { target { sse4_runtime } } } */ +/* { dg-options "-O3 -msse4" } */ + +struct s { unsigned int a, b, c; }; + +void __attribute__ ((noipa)) +foo (struct s *restrict s1, struct s *restrict s2, int n) +{ + for (int i = 0; i < n; ++i) + { + s1[i].b = s2[i].b; + s1[i].c = s2[i].c; + s2[i].c = 0; + } +} + +#define N 12 + +int +main () +{ + struct s s1[N], s2[N]; + for (unsigned int j = 0; j < N; ++j) + { + s2[j].a = j * 5; + s2[j].b = j * 5 + 2; + s2[j].c = j * 5 + 4; + } + foo (s1, s2, N); + for (unsigned int j = 0; j < N; ++j) + if (s1[j].b != j * 5 + 2) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr91033.c b/gcc/testsuite/gcc.target/i386/pr91033.c new file mode 100644 index 00000000000..43d99d5a7dc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr91033.c @@ -0,0 +1,15 @@ +/* PR tree-optimization/91033 */ +/* { dg-do compile { target pthread } } */ +/* { dg-options "-march=knl -O2 -fopenmp-simd -ftree-parallelize-loops=2" } */ + +#define N 1024 +int a[N]; + +void +foo (void) +{ + int i; + #pragma omp simd simdlen (4) + for (i = 0; i < N; ++i) + a[i] = a[i] + 1; +} diff --git a/gcc/testsuite/gfortran.dg/vect/vect-4.f90 b/gcc/testsuite/gfortran.dg/vect/vect-4.f90 index b567cbd8644..c2eeafd3900 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-4.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-4.f90 @@ -1,3 +1,5 @@ +! 
Disabling epilogues until we find a better way to deal with scans. +! { dg-additional-options "--param vect-epilogues-nomask=0" } ! { dg-do compile } ! { dg-require-effective-target vect_float } ! { dg-additional-options "--param vect-max-peeling-for-alignment=0" } diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 index 0ac5f1c390b..1c243308476 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 @@ -704,5 +704,6 @@ CALL track('KERNEL ') RETURN END SUBROUTINE kernel -! { dg-final { scan-tree-dump-times "vectorized 22 loops" 1 "vect" { target vect_intdouble_cvt } } } -! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { ! vect_intdouble_cvt } } } } +! { dg-final { scan-tree-dump-times "vectorized 23 loops" 1 "vect" { target aarch64*-*-* } } } +! { dg-final { scan-tree-dump-times "vectorized 22 loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } +! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } } diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c index f8aeec9bae8..621c8ea3dad 100644 --- a/gcc/tree-cfg.c +++ b/gcc/tree-cfg.c @@ -3557,6 +3557,24 @@ verify_gimple_assign_unary (gassign *stmt) { CASE_CONVERT: { + /* Allow conversions between vectors with the same number of elements, + provided that the conversion is OK for the element types too. */ + if (VECTOR_TYPE_P (lhs_type) + && VECTOR_TYPE_P (rhs1_type) + && known_eq (TYPE_VECTOR_SUBPARTS (lhs_type), + TYPE_VECTOR_SUBPARTS (rhs1_type))) + { + lhs_type = TREE_TYPE (lhs_type); + rhs1_type = TREE_TYPE (rhs1_type); + } + else if (VECTOR_TYPE_P (lhs_type) || VECTOR_TYPE_P (rhs1_type)) + { + error ("invalid vector types in nop conversion"); + debug_generic_expr (lhs_type); + debug_generic_expr (rhs1_type); + return true; + } + /* Allow conversions from pointer type to integral type only if there is no sign or zero extension involved. For targets were the precision of ptrofftype doesn't match that diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index d00c1bd31e6..c95dd204870 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -1287,7 +1287,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt, return dr; } -/* A helper function computes order between two tree epxressions T1 and T2. +/* A helper function computes order between two tree expressions T1 and T2. This is used in comparator functions sorting objects based on the order of tree expressions. The function returns -1, 0, or 1. */ @@ -1454,6 +1454,54 @@ comp_dr_with_seg_len_pair (const void *pa_, const void *pb_) return 0; } +/* Dump information about ALIAS_PAIR, indenting each line by INDENT. */ + +static void +dump_alias_pair (dr_with_seg_len_pair_t *alias_pair, const char *indent) +{ + dump_printf (MSG_NOTE, "%sreference: %T vs. %T\n", indent, + DR_REF (alias_pair->first.dr), + DR_REF (alias_pair->second.dr)); + + dump_printf (MSG_NOTE, "%ssegment length: %T", indent, + alias_pair->first.seg_len); + if (!operand_equal_p (alias_pair->first.seg_len, + alias_pair->second.seg_len, 0)) + dump_printf (MSG_NOTE, " vs. %T", alias_pair->second.seg_len); + + dump_printf (MSG_NOTE, "\n%saccess size: ", indent); + dump_dec (MSG_NOTE, alias_pair->first.access_size); + if (maybe_ne (alias_pair->first.access_size, alias_pair->second.access_size)) + { + dump_printf (MSG_NOTE, " vs. 
"); + dump_dec (MSG_NOTE, alias_pair->second.access_size); + } + + dump_printf (MSG_NOTE, "\n%salignment: %d", indent, + alias_pair->first.align); + if (alias_pair->first.align != alias_pair->second.align) + dump_printf (MSG_NOTE, " vs. %d", alias_pair->second.align); + + dump_printf (MSG_NOTE, "\n%sflags: ", indent); + if (alias_pair->flags & DR_ALIAS_RAW) + dump_printf (MSG_NOTE, " RAW"); + if (alias_pair->flags & DR_ALIAS_WAR) + dump_printf (MSG_NOTE, " WAR"); + if (alias_pair->flags & DR_ALIAS_WAW) + dump_printf (MSG_NOTE, " WAW"); + if (alias_pair->flags & DR_ALIAS_ARBITRARY) + dump_printf (MSG_NOTE, " ARBITRARY"); + if (alias_pair->flags & DR_ALIAS_SWAPPED) + dump_printf (MSG_NOTE, " SWAPPED"); + if (alias_pair->flags & DR_ALIAS_UNSWAPPED) + dump_printf (MSG_NOTE, " UNSWAPPED"); + if (alias_pair->flags & DR_ALIAS_MIXED_STEPS) + dump_printf (MSG_NOTE, " MIXED_STEPS"); + if (alias_pair->flags == 0) + dump_printf (MSG_NOTE, " "); + dump_printf (MSG_NOTE, "\n"); +} + /* Merge alias checks recorded in ALIAS_PAIRS and remove redundant ones. FACTOR is number of iterations that each data reference is accessed. @@ -1488,19 +1536,50 @@ void prune_runtime_alias_test_list (vec *alias_pairs, poly_uint64) { + if (alias_pairs->is_empty ()) + return; + + /* Canonicalize each pair so that the base components are ordered wrt + data_ref_compare_tree. This allows the loop below to merge more + cases. */ + unsigned int i; + dr_with_seg_len_pair_t *alias_pair; + FOR_EACH_VEC_ELT (*alias_pairs, i, alias_pair) + { + data_reference_p dr_a = alias_pair->first.dr; + data_reference_p dr_b = alias_pair->second.dr; + int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), + DR_BASE_ADDRESS (dr_b)); + if (comp_res == 0) + comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); + if (comp_res == 0) + comp_res = data_ref_compare_tree (DR_INIT (dr_a), DR_INIT (dr_b)); + if (comp_res > 0) + { + std::swap (alias_pair->first, alias_pair->second); + alias_pair->flags |= DR_ALIAS_SWAPPED; + } + else + alias_pair->flags |= DR_ALIAS_UNSWAPPED; + } + /* Sort the collected data ref pairs so that we can scan them once to combine all possible aliasing checks. */ alias_pairs->qsort (comp_dr_with_seg_len_pair); /* Scan the sorted dr pairs and check if we can combine alias checks of two neighboring dr pairs. */ - for (size_t i = 1; i < alias_pairs->length (); ++i) + unsigned int last = 0; + for (i = 1; i < alias_pairs->length (); ++i) { /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ - dr_with_seg_len *dr_a1 = &(*alias_pairs)[i-1].first, - *dr_b1 = &(*alias_pairs)[i-1].second, - *dr_a2 = &(*alias_pairs)[i].first, - *dr_b2 = &(*alias_pairs)[i].second; + dr_with_seg_len_pair_t *alias_pair1 = &(*alias_pairs)[last]; + dr_with_seg_len_pair_t *alias_pair2 = &(*alias_pairs)[i]; + + dr_with_seg_len *dr_a1 = &alias_pair1->first; + dr_with_seg_len *dr_b1 = &alias_pair1->second; + dr_with_seg_len *dr_a2 = &alias_pair2->first; + dr_with_seg_len *dr_b2 = &alias_pair2->second; /* Remove duplicate data ref pairs. */ if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) @@ -1509,10 +1588,16 @@ prune_runtime_alias_test_list (vec *alias_pairs, dump_printf (MSG_NOTE, "found equal ranges %T, %T and %T, %T\n", DR_REF (dr_a1->dr), DR_REF (dr_b1->dr), DR_REF (dr_a2->dr), DR_REF (dr_b2->dr)); - alias_pairs->ordered_remove (i--); + alias_pair1->flags |= alias_pair2->flags; continue; } + /* Assume that we won't be able to merge the pairs, then correct + if we do. 
*/ + last += 1; + if (last != i) + (*alias_pairs)[last] = (*alias_pairs)[i]; + if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) { /* We consider the case that DR_B1 and DR_B2 are same memrefs, @@ -1538,13 +1623,6 @@ prune_runtime_alias_test_list (vec *alias_pairs, if (!ordered_p (init_a1, init_a2)) continue; - /* Make sure dr_a1 starts left of dr_a2. */ - if (maybe_gt (init_a1, init_a2)) - { - std::swap (*dr_a1, *dr_a2); - std::swap (init_a1, init_a2); - } - /* Work out what the segment length would be if we did combine DR_A1 and DR_A2: @@ -1561,7 +1639,10 @@ prune_runtime_alias_test_list (vec *alias_pairs, The lengths both have sizetype, so the sign is taken from the step instead. */ - if (!operand_equal_p (dr_a1->seg_len, dr_a2->seg_len, 0)) + poly_uint64 new_seg_len = 0; + bool new_seg_len_p = !operand_equal_p (dr_a1->seg_len, + dr_a2->seg_len, 0); + if (new_seg_len_p) { poly_uint64 seg_len_a1, seg_len_a2; if (!poly_int_tree_p (dr_a1->seg_len, &seg_len_a1) @@ -1579,14 +1660,29 @@ prune_runtime_alias_test_list (vec *alias_pairs, int sign_a = tree_int_cst_sgn (indicator_a); int sign_b = tree_int_cst_sgn (indicator_b); - poly_uint64 new_seg_len; if (sign_a <= 0 && sign_b <= 0) new_seg_len = lower_bound (seg_len_a1, seg_len_a2); else if (sign_a >= 0 && sign_b >= 0) new_seg_len = upper_bound (seg_len_a1, seg_len_a2); else continue; + } + /* At this point we're committed to merging the refs. */ + /* Make sure dr_a1 starts left of dr_a2. */ + if (maybe_gt (init_a1, init_a2)) + { + std::swap (*dr_a1, *dr_a2); + std::swap (init_a1, init_a2); + } + + /* The DR_Bs are equal, so only the DR_As can introduce + mixed steps. */ + if (!operand_equal_p (DR_STEP (dr_a1->dr), DR_STEP (dr_a2->dr), 0)) + alias_pair1->flags |= DR_ALIAS_MIXED_STEPS; + + if (new_seg_len_p) + { dr_a1->seg_len = build_int_cst (TREE_TYPE (dr_a1->seg_len), new_seg_len); dr_a1->align = MIN (dr_a1->align, known_alignment (new_seg_len)); @@ -1608,17 +1704,40 @@ prune_runtime_alias_test_list (vec *alias_pairs, dump_printf (MSG_NOTE, "merging ranges for %T, %T and %T, %T\n", DR_REF (dr_a1->dr), DR_REF (dr_b1->dr), DR_REF (dr_a2->dr), DR_REF (dr_b2->dr)); - alias_pairs->ordered_remove (i); - i--; + alias_pair1->flags |= alias_pair2->flags; + last -= 1; } } + alias_pairs->truncate (last + 1); + + /* Try to restore the original dr_with_seg_len order within each + dr_with_seg_len_pair_t. If we ended up combining swapped and + unswapped pairs into the same check, we have to invalidate any + RAW, WAR and WAW information for it. */ + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "merged alias checks:\n"); + FOR_EACH_VEC_ELT (*alias_pairs, i, alias_pair) + { + unsigned int swap_mask = (DR_ALIAS_SWAPPED | DR_ALIAS_UNSWAPPED); + unsigned int swapped = (alias_pair->flags & swap_mask); + if (swapped == DR_ALIAS_SWAPPED) + std::swap (alias_pair->first, alias_pair->second); + else if (swapped != DR_ALIAS_UNSWAPPED) + alias_pair->flags |= DR_ALIAS_ARBITRARY; + alias_pair->flags &= ~swap_mask; + if (dump_enabled_p ()) + dump_alias_pair (alias_pair, " "); + } } -/* Given LOOP's two data references and segment lengths described by DR_A - and DR_B, create expression checking if the two addresses ranges intersect - with each other based on index of the two addresses. This can only be - done if DR_A and DR_B referring to the same (array) object and the index - is the only difference. 
For example: +/* Try to generate a runtime condition that is true if ALIAS_PAIR is + free of aliases, using a condition based on index values instead + of a condition based on addresses. Return true on success, + storing the condition in *COND_EXPR. + + This can only be done if the two data references in ALIAS_PAIR access + the same array object and the index is the only difference. For example, + if the two data references are DR_A and DR_B: DR_A DR_B data-ref arr[i] arr[j] @@ -1635,16 +1754,20 @@ prune_runtime_alias_test_list (vec *alias_pairs, We can create expression based on index rather than address: - (i_0 + 4 < j_0 || j_0 + 4 < i_0) + (unsigned) (i_0 - j_0 + 3) <= 6 + + i.e. the indices are less than 4 apart. Note evolution step of index needs to be considered in comparison. */ static bool create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, - const dr_with_seg_len& dr_a, - const dr_with_seg_len& dr_b) + const dr_with_seg_len_pair_t &alias_pair) { - if (integer_zerop (DR_STEP (dr_a.dr)) + const dr_with_seg_len &dr_a = alias_pair.first; + const dr_with_seg_len &dr_b = alias_pair.second; + if ((alias_pair.flags & DR_ALIAS_MIXED_STEPS) + || integer_zerop (DR_STEP (dr_a.dr)) || integer_zerop (DR_STEP (dr_b.dr)) || DR_NUM_DIMENSIONS (dr_a.dr) != DR_NUM_DIMENSIONS (dr_b.dr)) return false; @@ -1670,15 +1793,8 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, if (neg_step) { abs_step = -abs_step; - seg_len1 = -seg_len1; - seg_len2 = -seg_len2; - } - else - { - /* Include the access size in the length, so that we only have one - tree addition below. */ - seg_len1 += dr_a.access_size; - seg_len2 += dr_b.access_size; + seg_len1 = (-wi::to_poly_wide (dr_a.seg_len)).force_uhwi (); + seg_len2 = (-wi::to_poly_wide (dr_b.seg_len)).force_uhwi (); } /* Infer the number of iterations with which the memory segment is accessed @@ -1692,16 +1808,15 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, || !can_div_trunc_p (seg_len2 + abs_step - 1, abs_step, &niter_len2)) return false; - poly_uint64 niter_access1 = 0, niter_access2 = 0; - if (neg_step) - { - /* Divide each access size by the byte step, rounding up. */ - if (!can_div_trunc_p (dr_a.access_size - abs_step - 1, - abs_step, &niter_access1) - || !can_div_trunc_p (dr_b.access_size + abs_step - 1, - abs_step, &niter_access2)) - return false; - } + /* Divide each access size by the byte step, rounding up. */ + poly_uint64 niter_access1, niter_access2; + if (!can_div_trunc_p (dr_a.access_size + abs_step - 1, + abs_step, &niter_access1) + || !can_div_trunc_p (dr_b.access_size + abs_step - 1, + abs_step, &niter_access2)) + return false; + + bool waw_or_war_p = (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) == 0; unsigned int i; for (i = 0; i < DR_NUM_DIMENSIONS (dr_a.dr); i++) @@ -1741,44 +1856,298 @@ create_intersect_range_checks_index (struct loop *loop, tree *cond_expr, index of data reference. Like segment length, index length is linear function of the number of iterations with index_step as the coefficient, i.e, niter_len * idx_step. */ - tree idx_len1 = fold_build2 (MULT_EXPR, TREE_TYPE (min1), idx_step, - build_int_cst (TREE_TYPE (min1), - niter_len1)); - tree idx_len2 = fold_build2 (MULT_EXPR, TREE_TYPE (min2), idx_step, - build_int_cst (TREE_TYPE (min2), - niter_len2)); - tree max1 = fold_build2 (PLUS_EXPR, TREE_TYPE (min1), min1, idx_len1); - tree max2 = fold_build2 (PLUS_EXPR, TREE_TYPE (min2), min2, idx_len2); - /* Adjust ranges for negative step. 
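Illustrative aside (not from the patch itself): the folded form built below uses the standard unsigned range-check identity, i.e. for lo <= hi the test "lo <= x && x <= hi" is equivalent to a single unsigned comparison. A minimal standalone C sketch, assuming the span hi - lo does not overflow:

#include <stdbool.h>

/* True iff lo <= x <= hi: any x < lo wraps around to a huge unsigned
   value, so the single comparison below holds exactly when x lies in
   [lo, hi].  */
static bool
in_range (long x, long lo, long hi)
{
  return (unsigned long) (x - lo) <= (unsigned long) (hi - lo);
}

In the arr[i]/arr[j] example above, overlap means -3 <= i_0 - j_0 <= 3, which folds to (unsigned) (i_0 - j_0 + 3) <= 6; the new code below builds the same shape out of SUBJECT, BIAS and LIMIT.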
*/ + offset_int abs_idx_step = offset_int::from (wi::to_wide (idx_step), + SIGNED); if (neg_step) - { - /* IDX_LEN1 and IDX_LEN2 are negative in this case. */ - std::swap (min1, max1); - std::swap (min2, max2); - - /* As with the lengths just calculated, we've measured the access - sizes in iterations, so multiply them by the index step. */ - tree idx_access1 - = fold_build2 (MULT_EXPR, TREE_TYPE (min1), idx_step, - build_int_cst (TREE_TYPE (min1), niter_access1)); - tree idx_access2 - = fold_build2 (MULT_EXPR, TREE_TYPE (min2), idx_step, - build_int_cst (TREE_TYPE (min2), niter_access2)); - - /* MINUS_EXPR because the above values are negative. */ - max1 = fold_build2 (MINUS_EXPR, TREE_TYPE (max1), max1, idx_access1); - max2 = fold_build2 (MINUS_EXPR, TREE_TYPE (max2), max2, idx_access2); - } - tree part_cond_expr - = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, - fold_build2 (LE_EXPR, boolean_type_node, max1, min2), - fold_build2 (LE_EXPR, boolean_type_node, max2, min1)); + abs_idx_step = -abs_idx_step; + poly_offset_int idx_len1 = abs_idx_step * niter_len1; + poly_offset_int idx_len2 = abs_idx_step * niter_len2; + poly_offset_int idx_access1 = abs_idx_step * niter_access1; + poly_offset_int idx_access2 = abs_idx_step * niter_access2; + + gcc_assert (known_ge (idx_len1, 0) + && known_ge (idx_len2, 0) + && known_ge (idx_access1, 0) + && known_ge (idx_access2, 0)); + + /* Each access has the following pattern, with lengths measured + in units of INDEX: + + <-- idx_len --> + <--- A: -ve step ---> + +-----+-------+-----+-------+-----+ + | n-1 | ..... | 0 | ..... | n-1 | + +-----+-------+-----+-------+-----+ + <--- B: +ve step ---> + <-- idx_len --> + | + min + + where "n" is the number of scalar iterations covered by the segment + and where each access spans idx_access units. + + A is the range of bytes accessed when the step is negative, + B is the range when the step is positive. + + When checking for general overlap, we need to test whether + the range: + + [min1 + low_offset1, min2 + high_offset1 + idx_access1 - 1] + + overlaps: + + [min2 + low_offset2, min2 + high_offset2 + idx_access2 - 1] + + where: + + low_offsetN = +ve step ? 0 : -idx_lenN; + high_offsetN = +ve step ? idx_lenN : 0; + + This is equivalent to testing whether: + + min1 + low_offset1 <= min2 + high_offset2 + idx_access2 - 1 + && min2 + low_offset2 <= min1 + high_offset1 + idx_access1 - 1 + + Converting this into a single test, there is an overlap if: + + 0 <= min2 - min1 + bias <= limit + + where bias = high_offset2 + idx_access2 - 1 - low_offset1 + limit = (high_offset1 - low_offset1 + idx_access1 - 1) + + (high_offset2 - low_offset2 + idx_access2 - 1) + i.e. limit = idx_len1 + idx_access1 - 1 + idx_len2 + idx_access2 - 1 + + Combining the tests requires limit to be computable in an unsigned + form of the index type; if it isn't, we fall back to the usual + pointer-based checks. + + We can do better if DR_B is a write and if DR_A and DR_B are + well-ordered in both the original and the new code (see the + comment above the DR_ALIAS_* flags for details). In this case + we know that for each i in [0, n-1], the write performed by + access i of DR_B occurs after access numbers j<=i of DR_A in + both the original and the new code. Any write or anti + dependencies wrt those DR_A accesses are therefore maintained. 
+ + We just need to make sure that each individual write in DR_B does not + overlap any higher-indexed access in DR_A; such DR_A accesses happen + after the DR_B access in the original code but happen before it in + the new code. + + We know the steps for both accesses are equal, so by induction, we + just need to test whether the first write of DR_B overlaps a later + access of DR_A. In other words, we need to move min1 along by + one iteration: + + min1' = min1 + idx_step + + and use the ranges: + + [min1' + low_offset1', min1' + high_offset1' + idx_access1 - 1] + + and: + + [min2, min2 + idx_access2 - 1] + + where: + + low_offset1' = +ve step ? 0 : -(idx_len1 - |idx_step|) + high_offset1' = +ve_step ? idx_len1 - |idx_step| : 0. */ + if (waw_or_war_p) + idx_len1 -= abs_idx_step; + + poly_offset_int limit = idx_len1 + idx_access1 - 1 + idx_access2 - 1; + if (!waw_or_war_p) + limit += idx_len2; + + tree utype = unsigned_type_for (TREE_TYPE (min1)); + if (!wi::fits_to_tree_p (limit, utype)) + return false; + + poly_offset_int low_offset1 = neg_step ? -idx_len1 : 0; + poly_offset_int high_offset2 = neg_step || waw_or_war_p ? 0 : idx_len2; + poly_offset_int bias = high_offset2 + idx_access2 - 1 - low_offset1; + /* Equivalent to adding IDX_STEP to MIN1. */ + if (waw_or_war_p) + bias -= wi::to_offset (idx_step); + + tree subject = fold_build2 (MINUS_EXPR, utype, + fold_convert (utype, min2), + fold_convert (utype, min1)); + subject = fold_build2 (PLUS_EXPR, utype, subject, + wide_int_to_tree (utype, bias)); + tree part_cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, + wide_int_to_tree (utype, limit)); if (*cond_expr) *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, *cond_expr, part_cond_expr); else *cond_expr = part_cond_expr; } + if (dump_enabled_p ()) + { + if (waw_or_war_p) + dump_printf (MSG_NOTE, "using an index-based WAR/WAW test\n"); + else + dump_printf (MSG_NOTE, "using an index-based overlap test\n"); + } + return true; +} + +/* A subroutine of create_intersect_range_checks, with a subset of the + same arguments. Try to optimize cases in which the second access + is a write and in which some overlap is valid. */ + +static bool +create_waw_or_war_checks (tree *cond_expr, + const dr_with_seg_len_pair_t &alias_pair) +{ + const dr_with_seg_len& dr_a = alias_pair.first; + const dr_with_seg_len& dr_b = alias_pair.second; + + /* Check for cases in which: + + (a) DR_B is always a write; + (b) the accesses are well-ordered in both the original and new code + (see the comment above the DR_ALIAS_* flags for details); and + (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ + if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) + return false; + + /* Check for equal (but possibly variable) steps. */ + tree step = DR_STEP (dr_a.dr); + if (!operand_equal_p (step, DR_STEP (dr_b.dr))) + return false; + + /* Make sure that we can operate on sizetype without loss of precision. */ + tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); + if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) + return false; + + /* All addresses involved are known to have a common alignment ALIGN. + We can therefore subtract ALIGN from an exclusive endpoint to get + an inclusive endpoint. In the best (and common) case, ALIGN is the + same as the access sizes of both DRs, and so subtracting ALIGN + cancels out the addition of an access size. 
*/ + unsigned int align = MIN (dr_a.align, dr_b.align); + poly_uint64 last_chunk_a = dr_a.access_size - align; + poly_uint64 last_chunk_b = dr_b.access_size - align; + + /* Get a boolean expression that is true when the step is negative. */ + tree indicator = dr_direction_indicator (dr_a.dr); + tree neg_step = fold_build2 (LT_EXPR, boolean_type_node, + fold_convert (ssizetype, indicator), + ssize_int (0)); + + /* Get lengths in sizetype. */ + tree seg_len_a + = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); + step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); + + /* Each access has the following pattern: + + <- |seg_len| -> + <--- A: -ve step ---> + +-----+-------+-----+-------+-----+ + | n-1 | ..... | 0 | ..... | n-1 | + +-----+-------+-----+-------+-----+ + <--- B: +ve step ---> + <- |seg_len| -> + | + base address + + where "n" is the number of scalar iterations covered by the segment. + + A is the range of bytes accessed when the step is negative, + B is the range when the step is positive. + + We know that DR_B is a write. We also know (from checking that + DR_A and DR_B are well-ordered) that for each i in [0, n-1], + the write performed by access i of DR_B occurs after access numbers + j<=i of DR_A in both the original and the new code. Any write or + anti dependencies wrt those DR_A accesses are therefore maintained. + + We just need to make sure that each individual write in DR_B does not + overlap any higher-indexed access in DR_A; such DR_A accesses happen + after the DR_B access in the original code but happen before it in + the new code. + + We know the steps for both accesses are equal, so by induction, we + just need to test whether the first write of DR_B overlaps a later + access of DR_A. In other words, we need to move addr_a along by + one iteration: + + addr_a' = addr_a + step + + and check whether: + + [addr_b, addr_b + last_chunk_b] + + overlaps: + + [addr_a' + low_offset_a, addr_a' + high_offset_a + last_chunk_a] + + where [low_offset_a, high_offset_a] spans accesses [1, n-1]. I.e.: + + low_offset_a = +ve step ? 0 : seg_len_a - step + high_offset_a = +ve step ? seg_len_a - step : 0 + + This is equivalent to testing whether: + + addr_a' + low_offset_a <= addr_b + last_chunk_b + && addr_b <= addr_a' + high_offset_a + last_chunk_a + + Converting this into a single test, there is an overlap if: + + 0 <= addr_b + last_chunk_b - addr_a' - low_offset_a <= limit + + where limit = high_offset_a - low_offset_a + last_chunk_a + last_chunk_b + + If DR_A is performed, limit + |step| - last_chunk_b is known to be + less than the size of the object underlying DR_A. We also know + that last_chunk_b <= |step|; this is checked elsewhere if it isn't + guaranteed at compile time. There can therefore be no overflow if + "limit" is calculated in an unsigned type with pointer precision. */ + tree addr_a = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a.dr), + DR_OFFSET (dr_a.dr)); + addr_a = fold_build_pointer_plus (addr_a, DR_INIT (dr_a.dr)); + + tree addr_b = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b.dr), + DR_OFFSET (dr_b.dr)); + addr_b = fold_build_pointer_plus (addr_b, DR_INIT (dr_b.dr)); + + /* Advance ADDR_A by one iteration and adjust the length to compensate. 
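A hypothetical case (not from the patch) in which this relaxed WAR test pays off:

void
copy (int *dst, int *src, int n)
{
  for (int i = 0; i < n; ++i)
    dst[i] = src[i];   /* (src read, dst write) is a WAR pair with equal steps.  */
}

If the caller passes dst == src - 1, the two address ranges overlap and the plain overlap test would force the scalar fallback, yet each store dst[i] only touches src[i - 1], which both the scalar and the vector code have already read, so the check built here still accepts the vectorized loop.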
*/ + addr_a = fold_build_pointer_plus (addr_a, step); + tree seg_len_a_minus_step = fold_build2 (MINUS_EXPR, sizetype, + seg_len_a, step); + if (!CONSTANT_CLASS_P (seg_len_a_minus_step)) + seg_len_a_minus_step = build1 (SAVE_EXPR, sizetype, seg_len_a_minus_step); + + tree low_offset_a = fold_build3 (COND_EXPR, sizetype, neg_step, + seg_len_a_minus_step, size_zero_node); + if (!CONSTANT_CLASS_P (low_offset_a)) + low_offset_a = build1 (SAVE_EXPR, sizetype, low_offset_a); + + /* We could use COND_EXPR , + but it's usually more efficient to reuse the LOW_OFFSET_A result. */ + tree high_offset_a = fold_build2 (MINUS_EXPR, sizetype, seg_len_a_minus_step, + low_offset_a); + + /* The amount added to addr_b - addr_a'. */ + tree bias = fold_build2 (MINUS_EXPR, sizetype, + size_int (last_chunk_b), low_offset_a); + + tree limit = fold_build2 (MINUS_EXPR, sizetype, high_offset_a, low_offset_a); + limit = fold_build2 (PLUS_EXPR, sizetype, limit, + size_int (last_chunk_a + last_chunk_b)); + + tree subject = fold_build2 (POINTER_DIFF_EXPR, ssizetype, addr_b, addr_a); + subject = fold_build2 (PLUS_EXPR, sizetype, + fold_convert (sizetype, subject), bias); + + *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); return true; } @@ -1866,24 +2235,29 @@ get_segment_min_max (const dr_with_seg_len &d, tree *seg_min_out, *seg_max_out = fold_build_pointer_plus (addr_base, max_reach); } -/* Given two data references and segment lengths described by DR_A and DR_B, - create expression checking if the two addresses ranges intersect with - each other: +/* Generate a runtime condition that is true if ALIAS_PAIR is free of aliases, + storing the condition in *COND_EXPR. The fallback is to generate a + a test that the two accesses do not overlap: - ((DR_A_addr_0 + DR_A_segment_length_0) <= DR_B_addr_0) - || (DR_B_addr_0 + DER_B_segment_length_0) <= DR_A_addr_0)) */ + end_a <= start_b || end_b <= start_a. */ static void create_intersect_range_checks (struct loop *loop, tree *cond_expr, - const dr_with_seg_len& dr_a, - const dr_with_seg_len& dr_b) + const dr_with_seg_len_pair_t &alias_pair) { + const dr_with_seg_len& dr_a = alias_pair.first; + const dr_with_seg_len& dr_b = alias_pair.second; *cond_expr = NULL_TREE; - if (create_intersect_range_checks_index (loop, cond_expr, dr_a, dr_b)) + if (create_intersect_range_checks_index (loop, cond_expr, alias_pair)) + return; + + if (create_waw_or_war_checks (cond_expr, alias_pair)) return; unsigned HOST_WIDE_INT min_align; tree_code cmp_code; + /* We don't have to check DR_ALIAS_MIXED_STEPS here, since both versions + are equivalent. This is just an optimization heuristic. 
*/ if (TREE_CODE (DR_STEP (dr_a.dr)) == INTEGER_CST && TREE_CODE (DR_STEP (dr_b.dr)) == INTEGER_CST) { @@ -1924,6 +2298,8 @@ create_intersect_range_checks (struct loop *loop, tree *cond_expr, = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, fold_build2 (cmp_code, boolean_type_node, seg_a_max, seg_b_min), fold_build2 (cmp_code, boolean_type_node, seg_b_max, seg_a_min)); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using an address-based overlap test\n"); } /* Create a conditional expression that represents the run-time checks for @@ -1940,18 +2316,19 @@ create_runtime_alias_checks (struct loop *loop, tree part_cond_expr; fold_defer_overflow_warnings (); - for (size_t i = 0, s = alias_pairs->length (); i < s; ++i) + dr_with_seg_len_pair_t *alias_pair; + unsigned int i; + FOR_EACH_VEC_ELT (*alias_pairs, i, alias_pair) { - const dr_with_seg_len& dr_a = (*alias_pairs)[i].first; - const dr_with_seg_len& dr_b = (*alias_pairs)[i].second; - + gcc_assert (alias_pair->flags); if (dump_enabled_p ()) dump_printf (MSG_NOTE, "create runtime check for data references %T and %T\n", - DR_REF (dr_a.dr), DR_REF (dr_b.dr)); + DR_REF (alias_pair->first.dr), + DR_REF (alias_pair->second.dr)); /* Create condition expression for each pair data references. */ - create_intersect_range_checks (loop, &part_cond_expr, dr_a, dr_b); + create_intersect_range_checks (loop, &part_cond_expr, *alias_pair); if (*cond_expr) *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, *cond_expr, part_cond_expr); diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index 70cbb03b49c..9cb48a2ea3e 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -221,19 +221,113 @@ struct dr_with_seg_len unsigned int align; }; +/* Flags that describe a potential alias between two dr_with_seg_lens. + In general, each pair of dr_with_seg_lens represents a composite of + multiple access pairs P, so testing flags like DR_IS_READ on the DRs + does not give meaningful information. + + DR_ALIAS_RAW: + There is a pair in P for which the second reference is a read + and the first is a write. + + DR_ALIAS_WAR: + There is a pair in P for which the second reference is a write + and the first is a read. + + DR_ALIAS_WAW: + There is a pair in P for which both references are writes. + + DR_ALIAS_ARBITRARY: + Either + (a) it isn't possible to classify one pair in P as RAW, WAW or WAR; or + (b) there is a pair in P that breaks the ordering assumption below. + + This flag overrides the RAW, WAR and WAW flags above. + + DR_ALIAS_UNSWAPPED: + DR_ALIAS_SWAPPED: + Temporary flags that indicate whether there is a pair P whose + DRs have or haven't been swapped around. + + DR_ALIAS_MIXED_STEPS: + The DR_STEP for one of the data references in the pair does not + accurately describe that reference for all members of P. (Note + that the flag does not say anything about whether the DR_STEPs + of the two references in the pair are the same.) + + The ordering assumption mentioned above is that for every pair + (DR_A, DR_B) in P: + + (1) The original code accesses n elements for DR_A and n elements for DR_B, + interleaved as follows: + + one access of size DR_A.access_size at DR_A.dr + one access of size DR_B.access_size at DR_B.dr + one access of size DR_A.access_size at DR_A.dr + STEP_A + one access of size DR_B.access_size at DR_B.dr + STEP_B + one access of size DR_A.access_size at DR_A.dr + STEP_A * 2 + one access of size DR_B.access_size at DR_B.dr + STEP_B * 2 + ... 
+ + (2) The new code accesses the same data in exactly two chunks: + + one group of accesses spanning |DR_A.seg_len| + DR_A.access_size + one group of accesses spanning |DR_B.seg_len| + DR_B.access_size + + A pair might break this assumption if the DR_A and DR_B accesses + in the original or the new code are mingled in some way. For example, + if DR_A.access_size represents the effect of two individual writes + to nearby locations, the pair breaks the assumption if those writes + occur either side of the access for DR_B. + + Note that DR_ALIAS_ARBITRARY describes whether the ordering assumption + fails to hold for any individual pair in P. If the assumption *does* + hold for every pair in P, it doesn't matter whether it holds for the + composite pair or not. In other words, P should represent the complete + set of pairs that the composite pair is testing, so only the ordering + of two accesses in the same member of P matters. */ +const unsigned int DR_ALIAS_RAW = 1U << 0; +const unsigned int DR_ALIAS_WAR = 1U << 1; +const unsigned int DR_ALIAS_WAW = 1U << 2; +const unsigned int DR_ALIAS_ARBITRARY = 1U << 3; +const unsigned int DR_ALIAS_SWAPPED = 1U << 4; +const unsigned int DR_ALIAS_UNSWAPPED = 1U << 5; +const unsigned int DR_ALIAS_MIXED_STEPS = 1U << 6; + /* This struct contains two dr_with_seg_len objects with aliasing data refs. Two comparisons are generated from them. */ struct dr_with_seg_len_pair_t { - dr_with_seg_len_pair_t (const dr_with_seg_len& d1, - const dr_with_seg_len& d2) - : first (d1), second (d2) {} + /* WELL_ORDERED indicates that the ordering assumption described above + DR_ALIAS_ARBITRARY holds. REORDERED indicates that it doesn't. */ + enum sequencing { WELL_ORDERED, REORDERED }; + + dr_with_seg_len_pair_t (const dr_with_seg_len &, + const dr_with_seg_len &, sequencing); dr_with_seg_len first; dr_with_seg_len second; + unsigned int flags; }; +inline dr_with_seg_len_pair_t:: +dr_with_seg_len_pair_t (const dr_with_seg_len &d1, const dr_with_seg_len &d2, + sequencing seq) + : first (d1), second (d2), flags (0) +{ + if (DR_IS_READ (d1.dr) && DR_IS_WRITE (d2.dr)) + flags |= DR_ALIAS_WAR; + else if (DR_IS_WRITE (d1.dr) && DR_IS_READ (d2.dr)) + flags |= DR_ALIAS_RAW; + else if (DR_IS_WRITE (d1.dr) && DR_IS_WRITE (d2.dr)) + flags |= DR_ALIAS_WAW; + else + gcc_unreachable (); + if (seq == REORDERED) + flags |= DR_ALIAS_ARBITRARY; +} + enum data_dependence_direction { dir_positive, dir_negative, diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c index 2780a4b243f..bd946e14eb6 100644 --- a/gcc/tree-if-conv.c +++ b/gcc/tree-if-conv.c @@ -120,6 +120,7 @@ along with GCC; see the file COPYING3. If not see #include "fold-const.h" #include "tree-ssa-sccvn.h" #include "tree-cfgcleanup.h" +#include "tree-ssa-dse.h" /* Only handle PHIs with no more arguments unless we are asked to by simd pragma. */ @@ -2884,7 +2885,7 @@ ifcvt_split_critical_edges (struct loop *loop, bool aggressive_if_conv) loop vectorization. */ static void -ifcvt_local_dce (basic_block bb) +ifcvt_local_dce (class loop *loop) { gimple *stmt; gimple *stmt1; @@ -2901,6 +2902,10 @@ ifcvt_local_dce (basic_block bb) replace_uses_by (name_pair->first, name_pair->second); redundant_ssa_names.release (); + /* The loop has a single BB only. */ + basic_block bb = loop->header; + tree latch_vdef = NULL_TREE; + worklist.create (64); /* Consider all phi as live statements. 
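Hypothetical illustration (not from the patch) of the kind of store that the dead-store removal added further down in this function is meant to catch:

void
g (int *a, int *c, int n)
{
  for (int i = 0; i < n; ++i)
    {
      if (c[i])
        a[i] = 1;   /* becomes a full store to a[i] after if-conversion */
      a[i] = 2;     /* unconditionally overwrites it */
    }
}

Once if-conversion has collapsed the body into the loop's single remaining block, the store produced from the conditional assignment is completely covered by the later store, so dse_classify_store (stopped at the loop's latch virtual definition so the walk does not wrap around the back edge) reports DSE_STORE_DEAD and the store is deleted before vectorization.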
*/ for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) @@ -2908,6 +2913,8 @@ ifcvt_local_dce (basic_block bb) phi = gsi_stmt (gsi); gimple_set_plf (phi, GF_PLF_2, true); worklist.safe_push (phi); + if (virtual_operand_p (gimple_phi_result (phi))) + latch_vdef = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); } /* Consider load/store statements, CALL and COND as live. */ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) @@ -2971,6 +2978,19 @@ ifcvt_local_dce (basic_block bb) while (!gsi_end_p (gsi)) { stmt = gsi_stmt (gsi); + if (gimple_store_p (stmt)) + { + tree lhs = gimple_get_lhs (stmt); + ao_ref write; + ao_ref_init (&write, lhs); + + if (dse_classify_store (&write, stmt, false, NULL, NULL, latch_vdef) + == DSE_STORE_DEAD) + delete_dead_or_redundant_assignment (&gsi, "dead"); + gsi_next (&gsi); + continue; + } + if (gimple_plf (stmt, GF_PLF_2)) { gsi_next (&gsi); @@ -3071,9 +3091,6 @@ tree_if_conversion (struct loop *loop, vec *preds) on-the-fly. */ combine_blocks (loop); - /* Delete dead predicate computations. */ - ifcvt_local_dce (loop->header); - /* Perform local CSE, this esp. helps the vectorizer analysis if loads and stores are involved. CSE only the loop body, not the entry PHIs, those are to be kept in sync with the non-if-converted copy. @@ -3082,6 +3099,9 @@ tree_if_conversion (struct loop *loop, vec *preds) bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index); bitmap_set_bit (exit_bbs, loop->latch->index); todo |= do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs); + + /* Delete dead predicate computations. */ + ifcvt_local_dce (loop); BITMAP_FREE (exit_bbs); todo |= TODO_cleanup_cfg; diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c index d115fcb1a5b..2fbcd6e3e46 100644 --- a/gcc/tree-inline.c +++ b/gcc/tree-inline.c @@ -6201,11 +6201,11 @@ tree_function_versioning (tree old_decl, tree new_decl, in the debug info that var (whole DECL_ORIGIN is the parm PARM_DECL) is optimized away, but could be looked up at the call site as value of D#X there. */ - tree var = vars, vexpr; + tree vexpr; gimple_stmt_iterator cgsi = gsi_after_labels (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun))); gimple *def_temp; - var = vars; + tree var = vars; i = vec_safe_length (*debug_args); do { diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index 8959f52a67b..a002bcd57b2 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -2445,12 +2445,6 @@ compute_alias_check_pairs (struct loop *loop, vec *alias_ddrs, struct data_reference *dr_a = DDR_A (ddr); struct data_reference *dr_b = DDR_B (ddr); tree seg_length_a, seg_length_b; - int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), - DR_BASE_ADDRESS (dr_b)); - - if (comp_res == 0) - comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); - gcc_assert (comp_res != 0); if (latch_dominated_by_data_ref (loop, dr_a)) seg_length_a = data_ref_segment_size (dr_a, niters_plus_one); @@ -2471,11 +2465,9 @@ compute_alias_check_pairs (struct loop *loop, vec *alias_ddrs, dr_with_seg_len_pair_t dr_with_seg_len_pair (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a), - dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b)); - - /* Canonicalize pairs by sorting the two DR members. */ - if (comp_res > 0) - std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); + dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b), + /* ??? Would WELL_ORDERED be safe? 
*/ + dr_with_seg_len_pair_t::REORDERED); comp_alias_pairs->safe_push (dr_with_seg_len_pair); } diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c index dad6e2884db..e841da66db5 100644 --- a/gcc/tree-parloops.c +++ b/gcc/tree-parloops.c @@ -88,7 +88,8 @@ along with GCC; see the file COPYING3. If not see More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC */ /* Reduction handling: - currently we use vect_force_simple_reduction() to detect reduction patterns. + currently we use code inspired by vect_force_simple_reduction to detect + reduction patterns. The code transformation will be introduced by an example. @@ -182,6 +183,717 @@ parloop */ +/* Error reporting helper for parloops_is_simple_reduction below. GIMPLE + statement STMT is printed with a message MSG. */ + +static void +report_ploop_op (dump_flags_t msg_type, gimple *stmt, const char *msg) +{ + dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); +} + +/* DEF_STMT_INFO occurs in a loop that contains a potential reduction + operation. Return true if the results of DEF_STMT_INFO are something + that can be accumulated by such a reduction. */ + +static bool +parloops_valid_reduction_input_p (stmt_vec_info def_stmt_info) +{ + return (is_gimple_assign (def_stmt_info->stmt) + || is_gimple_call (def_stmt_info->stmt) + || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def + || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI + && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def + && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt)))); +} + +/* Detect SLP reduction of the form: + + #a1 = phi + a2 = operation (a1) + a3 = operation (a2) + a4 = operation (a3) + a5 = operation (a4) + + #a = phi + + PHI is the reduction phi node (#a1 = phi above) + FIRST_STMT is the first reduction stmt in the chain + (a2 = operation (a1)). + + Return TRUE if a reduction chain was detected. */ + +static bool +parloops_is_slp_reduction (loop_vec_info loop_info, gimple *phi, + gimple *first_stmt) +{ + class loop *loop = (gimple_bb (phi))->loop_father; + class loop *vect_loop = LOOP_VINFO_LOOP (loop_info); + enum tree_code code; + gimple *loop_use_stmt = NULL; + stmt_vec_info use_stmt_info; + tree lhs; + imm_use_iterator imm_iter; + use_operand_p use_p; + int nloop_uses, size = 0, n_out_of_loop_uses; + bool found = false; + + if (loop != vect_loop) + return false; + + auto_vec reduc_chain; + lhs = PHI_RESULT (phi); + code = gimple_assign_rhs_code (first_stmt); + while (1) + { + nloop_uses = 0; + n_out_of_loop_uses = 0; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + + /* Check if we got back to the reduction phi. */ + if (use_stmt == phi) + { + loop_use_stmt = use_stmt; + found = true; + break; + } + + if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) + { + loop_use_stmt = use_stmt; + nloop_uses++; + } + else + n_out_of_loop_uses++; + + /* There are can be either a single use in the loop or two uses in + phi nodes. */ + if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) + return false; + } + + if (found) + break; + + /* We reached a statement with no loop uses. */ + if (nloop_uses == 0) + return false; + + /* This is a loop exit phi, and we haven't reached the reduction phi. 
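A hypothetical source loop that produces such a chain (illustration only):

int
sum4 (int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; ++i)
    s = s + a[4 * i] + a[4 * i + 1] + a[4 * i + 2] + a[4 * i + 3];
  return s;
}

Each statement in the chain adds one more element to the value flowing from the reduction PHI (a2 = a1 + a[4*i], a3 = a2 + a[4*i+1], and so on), with the final result feeding the PHI back; the walk above follows exactly that single-use chain from the PHI result to the latch definition.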
*/ + if (gimple_code (loop_use_stmt) == GIMPLE_PHI) + return false; + + if (!is_gimple_assign (loop_use_stmt) + || code != gimple_assign_rhs_code (loop_use_stmt) + || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) + return false; + + /* Insert USE_STMT into reduction chain. */ + use_stmt_info = loop_info->lookup_stmt (loop_use_stmt); + reduc_chain.safe_push (use_stmt_info); + + lhs = gimple_assign_lhs (loop_use_stmt); + size++; + } + + if (!found || loop_use_stmt != phi || size < 2) + return false; + + /* Swap the operands, if needed, to make the reduction operand be the second + operand. */ + lhs = PHI_RESULT (phi); + for (unsigned i = 0; i < reduc_chain.length (); ++i) + { + gassign *next_stmt = as_a (reduc_chain[i]->stmt); + if (gimple_assign_rhs2 (next_stmt) == lhs) + { + tree op = gimple_assign_rhs1 (next_stmt); + stmt_vec_info def_stmt_info = loop_info->lookup_def (op); + + /* Check that the other def is either defined in the loop + ("vect_internal_def"), or it's an induction (defined by a + loop-header phi-node). */ + if (def_stmt_info + && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) + && parloops_valid_reduction_input_p (def_stmt_info)) + { + lhs = gimple_assign_lhs (next_stmt); + continue; + } + + return false; + } + else + { + tree op = gimple_assign_rhs2 (next_stmt); + stmt_vec_info def_stmt_info = loop_info->lookup_def (op); + + /* Check that the other def is either defined in the loop + ("vect_internal_def"), or it's an induction (defined by a + loop-header phi-node). */ + if (def_stmt_info + && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) + && parloops_valid_reduction_input_p (def_stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G", + next_stmt); + + swap_ssa_operands (next_stmt, + gimple_assign_rhs1_ptr (next_stmt), + gimple_assign_rhs2_ptr (next_stmt)); + update_stmt (next_stmt); + + if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) + LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; + } + else + return false; + } + + lhs = gimple_assign_lhs (next_stmt); + } + + /* Build up the actual chain. */ + for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) + { + REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; + REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; + } + REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; + REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; + + /* Save the chain for further analysis in SLP detection. */ + LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); + REDUC_GROUP_SIZE (reduc_chain[0]) = size; + + return true; +} + +/* Return true if we need an in-order reduction for operation CODE + on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer + overflow must wrap. */ + +static bool +parloops_needs_fold_left_reduction_p (tree type, tree_code code, + bool need_wrapping_integral_overflow) +{ + /* CHECKME: check for !flag_finite_math_only too? 
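For instance (illustration, not from the patch):

float
fsum (float *x, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; ++i)
    s += x[i];   /* PLUS_EXPR on float: must stay in order unless
                    -fassociative-math is in effect.  */
  return s;
}

MIN_EXPR and MAX_EXPR on floats may be reordered freely, while for integers the answer depends on whether the operation can trap on overflow (e.g. signed addition with -ftrapv) or whether wrapping overflow has to be preserved.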
*/ + if (SCALAR_FLOAT_TYPE_P (type)) + switch (code) + { + case MIN_EXPR: + case MAX_EXPR: + return false; + + default: + return !flag_associative_math; + } + + if (INTEGRAL_TYPE_P (type)) + { + if (!operation_no_trapping_overflow (type, code)) + return true; + if (need_wrapping_integral_overflow + && !TYPE_OVERFLOW_WRAPS (type) + && operation_can_overflow (code)) + return true; + return false; + } + + if (SAT_FIXED_POINT_TYPE_P (type)) + return true; + + return false; +} + + +/* Function parloops_is_simple_reduction + + (1) Detect a cross-iteration def-use cycle that represents a simple + reduction computation. We look for the following pattern: + + loop_header: + a1 = phi < a0, a2 > + a3 = ... + a2 = operation (a3, a1) + + or + + a3 = ... + loop_header: + a1 = phi < a0, a2 > + a2 = operation (a3, a1) + + such that: + 1. operation is commutative and associative and it is safe to + change the order of the computation + 2. no uses for a2 in the loop (a2 is used out of the loop) + 3. no uses of a1 in the loop besides the reduction operation + 4. no uses of a1 outside the loop. + + Conditions 1,4 are tested here. + Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. + + (2) Detect a cross-iteration def-use cycle in nested loops, i.e., + nested cycles. + + (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double + reductions: + + a1 = phi < a0, a2 > + inner loop (def of a3) + a2 = phi < a3 > + + (4) Detect condition expressions, ie: + for (int i = 0; i < N; i++) + if (a[i] < val) + ret_val = a[i]; + +*/ + +static stmt_vec_info +parloops_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + bool *double_reduc, + bool need_wrapping_integral_overflow, + enum vect_reduction_type *v_reduc_type) +{ + gphi *phi = as_a (phi_info->stmt); + class loop *loop = (gimple_bb (phi))->loop_father; + class loop *vect_loop = LOOP_VINFO_LOOP (loop_info); + bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); + gimple *phi_use_stmt = NULL; + enum tree_code orig_code, code; + tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; + tree type; + tree name; + imm_use_iterator imm_iter; + use_operand_p use_p; + bool phi_def; + + *double_reduc = false; + *v_reduc_type = TREE_CODE_REDUCTION; + + tree phi_name = PHI_RESULT (phi); + /* ??? If there are no uses of the PHI result the inner loop reduction + won't be detected as possibly double-reduction by vectorizable_reduction + because that tries to walk the PHI arg from the preheader edge which + can be constant. See PR60382. 
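Concrete scalar shapes of patterns (1) and (4) above (hypothetical examples):

int
red1 (int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];          /* (1): a1 = PHI <a0, a2>;  a2 = a1 + a[i].  */
  return sum;
}

int
red4 (int *a, int n, int val)
{
  int ret_val = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] < val)
      ret_val = a[i];     /* (4): the COND_EXPR case.  */
  return ret_val;
}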
*/ + if (has_zero_uses (phi_name)) + return NULL; + unsigned nphi_def_loop_uses = 0; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + + if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "intermediate value used outside loop.\n"); + + return NULL; + } + + nphi_def_loop_uses++; + phi_use_stmt = use_stmt; + } + + edge latch_e = loop_latch_edge (loop); + tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); + if (TREE_CODE (loop_arg) != SSA_NAME) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction: not ssa_name: %T\n", loop_arg); + return NULL; + } + + stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); + if (!def_stmt_info + || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) + return NULL; + + if (gassign *def_stmt = dyn_cast (def_stmt_info->stmt)) + { + name = gimple_assign_lhs (def_stmt); + phi_def = false; + } + else if (gphi *def_stmt = dyn_cast (def_stmt_info->stmt)) + { + name = PHI_RESULT (def_stmt); + phi_def = true; + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction: unhandled reduction operation: %G", + def_stmt_info->stmt); + return NULL; + } + + unsigned nlatch_def_loop_uses = 0; + auto_vec lcphis; + bool inner_loop_of_double_reduc = false; + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) + nlatch_def_loop_uses++; + else + { + /* We can have more than one loop-closed PHI. */ + lcphis.safe_push (as_a (use_stmt)); + if (nested_in_vect_loop + && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) + == vect_double_reduction_def)) + inner_loop_of_double_reduc = true; + } + } + + /* If this isn't a nested cycle or if the nested cycle reduction value + is used ouside of the inner loop we cannot handle uses of the reduction + value. */ + if ((!nested_in_vect_loop || inner_loop_of_double_reduc) + && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction used in loop.\n"); + return NULL; + } + + /* If DEF_STMT is a phi node itself, we expect it to have a single argument + defined in the inner loop. */ + if (phi_def) + { + gphi *def_stmt = as_a (def_stmt_info->stmt); + op1 = PHI_ARG_DEF (def_stmt, 0); + + if (gimple_phi_num_args (def_stmt) != 1 + || TREE_CODE (op1) != SSA_NAME) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported phi node definition.\n"); + + return NULL; + } + + gimple *def1 = SSA_NAME_DEF_STMT (op1); + if (gimple_bb (def1) + && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) + && loop->inner + && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) + && is_gimple_assign (def1) + && is_a (phi_use_stmt) + && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, + "detected double reduction: "); + + *double_reduc = true; + return def_stmt_info; + } + + return NULL; + } + + /* If we are vectorizing an inner reduction we are executing that + in the original order only in case we are not dealing with a + double reduction. 
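As an editorial illustration (not part of the patch), a typical double reduction at the source level is a nested accumulation such as

    for (int i = 0; i < n; i++)
      for (int j = 0; j < m; j++)
        sum += a[i][j];

where, once the outer loop is the one being vectorized, the inner loop's reduction PHI feeds the outer loop's PHI; this is the cycle described as case (3) in the function comment above.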
*/ + bool check_reduction = true; + if (flow_loop_nested_p (vect_loop, loop)) + { + gphi *lcphi; + unsigned i; + check_reduction = false; + FOR_EACH_VEC_ELT (lcphis, i, lcphi) + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) + check_reduction = true; + } + } + + gassign *def_stmt = as_a (def_stmt_info->stmt); + code = orig_code = gimple_assign_rhs_code (def_stmt); + + if (nested_in_vect_loop && !check_reduction) + { + /* FIXME: Even for non-reductions code generation is funneled + through vectorizable_reduction for the stmt defining the + PHI latch value. So we have to artificially restrict ourselves + for the supported operations. */ + switch (get_gimple_rhs_class (code)) + { + case GIMPLE_BINARY_RHS: + case GIMPLE_TERNARY_RHS: + break; + default: + /* Not supported by vectorizable_reduction. */ + if (dump_enabled_p ()) + report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "nested cycle: not handled operation: "); + return NULL; + } + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, "detected nested cycle: "); + return def_stmt_info; + } + + /* We can handle "res -= x[i]", which is non-associative by + simply rewriting this into "res += -x[i]". Avoid changing + gimple instruction for the first simple tests and only do this + if we're allowed to change code at all. */ + if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) + code = PLUS_EXPR; + + if (code == COND_EXPR) + { + if (! nested_in_vect_loop) + *v_reduc_type = COND_REDUCTION; + + op3 = gimple_assign_rhs1 (def_stmt); + if (COMPARISON_CLASS_P (op3)) + { + op4 = TREE_OPERAND (op3, 1); + op3 = TREE_OPERAND (op3, 0); + } + if (op3 == phi_name || op4 == phi_name) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "reduction: condition depends on previous" + " iteration: "); + return NULL; + } + + op1 = gimple_assign_rhs2 (def_stmt); + op2 = gimple_assign_rhs3 (def_stmt); + } + else if (!commutative_tree_code (code) || !associative_tree_code (code)) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "reduction: not commutative/associative: "); + return NULL; + } + else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) + { + op1 = gimple_assign_rhs1 (def_stmt); + op2 = gimple_assign_rhs2 (def_stmt); + } + else + { + if (dump_enabled_p ()) + report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "reduction: not handled operation: "); + return NULL; + } + + if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "reduction: both uses not ssa_names: "); + + return NULL; + } + + type = TREE_TYPE (gimple_assign_lhs (def_stmt)); + if ((TREE_CODE (op1) == SSA_NAME + && !types_compatible_p (type,TREE_TYPE (op1))) + || (TREE_CODE (op2) == SSA_NAME + && !types_compatible_p (type, TREE_TYPE (op2))) + || (op3 && TREE_CODE (op3) == SSA_NAME + && !types_compatible_p (type, TREE_TYPE (op3))) + || (op4 && TREE_CODE (op4) == SSA_NAME + && !types_compatible_p (type, TREE_TYPE (op4)))) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "reduction: multiple types: operation type: " + "%T, operands types: %T,%T", + type, TREE_TYPE (op1), TREE_TYPE (op2)); + if (op3) + dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3)); + + if (op4) + dump_printf (MSG_NOTE, ",%T", TREE_TYPE 
(op4)); + dump_printf (MSG_NOTE, "\n"); + } + + return NULL; + } + + /* Check whether it's ok to change the order of the computation. + Generally, when vectorizing a reduction we change the order of the + computation. This may change the behavior of the program in some + cases, so we need to check that this is ok. One exception is when + vectorizing an outer-loop: the inner-loop is executed sequentially, + and therefore vectorizing reductions in the inner-loop during + outer-loop vectorization is safe. */ + if (check_reduction + && *v_reduc_type == TREE_CODE_REDUCTION + && parloops_needs_fold_left_reduction_p (type, code, + need_wrapping_integral_overflow)) + *v_reduc_type = FOLD_LEFT_REDUCTION; + + /* Reduction is safe. We're dealing with one of the following: + 1) integer arithmetic and no trapv + 2) floating point arithmetic, and special flags permit this optimization + 3) nested cycle (i.e., outer loop vectorization). */ + stmt_vec_info def1_info = loop_info->lookup_def (op1); + stmt_vec_info def2_info = loop_info->lookup_def (op2); + if (code != COND_EXPR && !def1_info && !def2_info) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, + "reduction: no defs for operands: "); + return NULL; + } + + /* Check that one def is the reduction def, defined by PHI, + the other def is either defined in the loop ("vect_internal_def"), + or it's an induction (defined by a loop-header phi-node). */ + + if (def2_info + && def2_info->stmt == phi + && (code == COND_EXPR + || !def1_info + || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt)) + || parloops_valid_reduction_input_p (def1_info))) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: "); + return def_stmt_info; + } + + if (def1_info + && def1_info->stmt == phi + && (code == COND_EXPR + || !def2_info + || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt)) + || parloops_valid_reduction_input_p (def2_info))) + { + if (! nested_in_vect_loop && orig_code != MINUS_EXPR) + { + /* Check if we can swap operands (just for simplicity - so that + the rest of the code can assume that the reduction variable + is always the last (second) argument). */ + if (code == COND_EXPR) + { + /* Swap cond_expr by inverting the condition. */ + tree cond_expr = gimple_assign_rhs1 (def_stmt); + enum tree_code invert_code = ERROR_MARK; + enum tree_code cond_code = TREE_CODE (cond_expr); + + if (TREE_CODE_CLASS (cond_code) == tcc_comparison) + { + bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); + invert_code = invert_tree_comparison (cond_code, honor_nans); + } + if (invert_code != ERROR_MARK) + { + TREE_SET_CODE (cond_expr, invert_code); + swap_ssa_operands (def_stmt, + gimple_assign_rhs2_ptr (def_stmt), + gimple_assign_rhs3_ptr (def_stmt)); + } + else + { + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, + "detected reduction: cannot swap operands " + "for cond_expr"); + return NULL; + } + } + else + swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), + gimple_assign_rhs2_ptr (def_stmt)); + + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, + "detected reduction: need to swap operands: "); + + if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) + LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; + } + else + { + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: "); + } + + return def_stmt_info; + } + + /* Try to find SLP reduction chain. */ + if (! 
nested_in_vect_loop + && code != COND_EXPR + && orig_code != MINUS_EXPR + && parloops_is_slp_reduction (loop_info, phi, def_stmt)) + { + if (dump_enabled_p ()) + report_ploop_op (MSG_NOTE, def_stmt, + "reduction: detected reduction chain: "); + + return def_stmt_info; + } + + /* Look for the expression computing loop_arg from loop PHI result. */ + if (check_reduction_path (vect_location, loop, phi, loop_arg, code)) + return def_stmt_info; + + if (dump_enabled_p ()) + { + report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "reduction: unknown pattern: "); + } + + return NULL; +} + +/* Wrapper around vect_is_simple_reduction, which will modify code + in-place if it enables detection of more reductions. Arguments + as there. */ + +stmt_vec_info +parloops_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, + bool *double_reduc, + bool need_wrapping_integral_overflow) +{ + enum vect_reduction_type v_reduc_type; + stmt_vec_info def_info + = parloops_is_simple_reduction (loop_info, phi_info, double_reduc, + need_wrapping_integral_overflow, + &v_reduc_type); + if (def_info) + { + STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type; + STMT_VINFO_REDUC_DEF (phi_info) = def_info; + STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type; + STMT_VINFO_REDUC_DEF (def_info) = phi_info; + } + return def_info; +} + /* Minimal number of iterations of a loop that should be executed in each thread. */ #define MIN_PER_THREAD PARAM_VALUE (PARAM_PARLOOPS_MIN_PER_THREAD) @@ -2614,9 +3326,9 @@ gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list continue; stmt_vec_info reduc_stmt_info - = vect_force_simple_reduction (simple_loop_info, - simple_loop_info->lookup_stmt (phi), - &double_reduc, true); + = parloops_force_simple_reduction (simple_loop_info, + simple_loop_info->lookup_stmt (phi), + &double_reduc, true); if (!reduc_stmt_info || !valid_reduction_p (reduc_stmt_info)) continue; @@ -2663,9 +3375,9 @@ gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list stmt_vec_info inner_phi_info = simple_loop_info->lookup_stmt (inner_phi); stmt_vec_info inner_reduc_stmt_info - = vect_force_simple_reduction (simple_loop_info, - inner_phi_info, - &double_reduc, true); + = parloops_force_simple_reduction (simple_loop_info, + inner_phi_info, + &double_reduc, true); gcc_assert (!double_reduc); if (!inner_reduc_stmt_info || !valid_reduction_p (inner_reduc_stmt_info)) diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c index 01f095382d6..54e8adc8d7c 100644 --- a/gcc/tree-ssa-alias.c +++ b/gcc/tree-ssa-alias.c @@ -2535,13 +2535,36 @@ stmt_kills_ref_p (gimple *stmt, ao_ref *ref) case BUILT_IN_MEMSET_CHK: case BUILT_IN_STRNCPY: case BUILT_IN_STPNCPY: + case BUILT_IN_CALLOC: { /* For a must-alias check we need to be able to constrain the access properly. */ if (!ref->max_size_known_p ()) return false; - tree dest = gimple_call_arg (stmt, 0); - tree len = gimple_call_arg (stmt, 2); + tree dest; + tree len; + + /* In execution order a calloc call will never kill + anything. However, DSE will (ab)use this interface + to ask if a calloc call writes the same memory locations + as a later assignment, memset, etc. So handle calloc + in the expected way. 
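For instance (an illustrative sketch for this write-up, not code from the patch), with constant arguments

    int *p = calloc (64, sizeof (int));
    memset (p, 0, 64 * sizeof (int));   /* already zero; DSE can now remove this */

calloc is modelled below as writing 64 * sizeof (int) zero bytes at its return value, so stmt_kills_ref_p can report that it covers the later memset and DSE deletes that memset as redundant.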
*/ + if (DECL_FUNCTION_CODE (callee) == BUILT_IN_CALLOC) + { + tree arg0 = gimple_call_arg (stmt, 0); + tree arg1 = gimple_call_arg (stmt, 1); + if (TREE_CODE (arg0) != INTEGER_CST + || TREE_CODE (arg1) != INTEGER_CST) + return false; + + dest = gimple_call_lhs (stmt); + len = fold_build2 (MULT_EXPR, TREE_TYPE (arg0), arg0, arg1); + } + else + { + dest = gimple_call_arg (stmt, 0); + len = gimple_call_arg (stmt, 2); + } if (!poly_int_tree_p (len)) return false; tree rbase = ref->base; diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c index efe5b31cc0a..c20fbe048ed 100644 --- a/gcc/tree-ssa-dse.c +++ b/gcc/tree-ssa-dse.c @@ -1,4 +1,4 @@ -/* Dead store elimination +/* Dead and redundant store elimination Copyright (C) 2004-2019 Free Software Foundation, Inc. This file is part of GCC. @@ -36,17 +36,26 @@ along with GCC; see the file COPYING3. If not see #include "params.h" #include "alias.h" #include "tree-ssa-loop.h" +#include "tree-ssa-dse.h" /* This file implements dead store elimination. A dead store is a store into a memory location which will later be overwritten by another store without any intervening loads. In this - case the earlier store can be deleted. + case the earlier store can be deleted or trimmed if the store + was partially dead. + + A redundant store is a store into a memory location which stores + the exact same value as a prior store to the same memory location. + While this can often be handled by dead store elimination, removing + the redundant store is often better than removing or trimming the + dead store. In our SSA + virtual operand world we use immediate uses of virtual - operands to detect dead stores. If a store's virtual definition + operands to detect these cases. If a store's virtual definition is used precisely once by a later store to the same location which - post dominates the first store, then the first store is dead. + post dominates the first store, then the first store is dead. If + the data stored is the same, then the second store is redundant. The single use of the store's virtual definition ensures that there are no intervening aliased loads and the requirement that @@ -58,7 +67,9 @@ along with GCC; see the file COPYING3. If not see the point immediately before the later store. Again, the single use of the virtual definition and the post-dominance relationship ensure that such movement would be safe. Clearly if there are - back to back stores, then the second is redundant. + back to back stores, then the second is makes the first dead. If + the second store stores the same value, then the second store is + redundant. Reviewing section 10.7.2 in Morgan's "Building an Optimizing Compiler" may also help in understanding this code since it discusses the @@ -66,19 +77,13 @@ along with GCC; see the file COPYING3. If not see fact, they are the same transformation applied to different views of the CFG. */ +void delete_dead_or_redundant_assignment (gimple_stmt_iterator *, const char *); +static void delete_dead_or_redundant_call (gimple_stmt_iterator *, const char *); /* Bitmap of blocks that have had EH statements cleaned. We should remove their dead edges eventually. */ static bitmap need_eh_cleanup; -/* Return value from dse_classify_store */ -enum dse_store_status -{ - DSE_STORE_LIVE, - DSE_STORE_MAYBE_PARTIAL_DEAD, - DSE_STORE_DEAD -}; - /* STMT is a statement that may write into memory. Analyze it and initialize WRITE to describe how STMT affects memory. 
@@ -106,6 +111,25 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write) ao_ref_init_from_ptr_and_size (write, ptr, size); return true; } + + /* A calloc call can never be dead, but it can make + subsequent stores redundant if they store 0 into + the same memory locations. */ + case BUILT_IN_CALLOC: + { + tree nelem = gimple_call_arg (stmt, 0); + tree selem = gimple_call_arg (stmt, 1); + if (TREE_CODE (nelem) == INTEGER_CST + && TREE_CODE (selem) == INTEGER_CST) + { + tree lhs = gimple_call_lhs (stmt); + tree size = fold_build2 (MULT_EXPR, TREE_TYPE (nelem), + nelem, selem); + ao_ref_init_from_ptr_and_size (write, lhs, size); + return true; + } + } + default: break; } @@ -551,16 +575,84 @@ check_name (tree, tree *idx, void *data) return true; } +/* STMT stores the value 0 into one or more memory locations + (via memset, empty constructor, calloc call, etc). + + See if there is a subsequent store of the value 0 to one + or more of the same memory location(s). If so, the subsequent + store is redundant and can be removed. + + The subsequent stores could be via memset, empty constructors, + simple MEM stores, etc. */ + +static void +dse_optimize_redundant_stores (gimple *stmt) +{ + int cnt = 0; + + /* We could do something fairly complex and look through PHIs + like DSE_CLASSIFY_STORE, but it doesn't seem to be worth + the effort. + + Look at all the immediate uses of the VDEF (which are obviously + dominated by STMT). See if one or more stores 0 into the same + memory locations a STMT, if so remove the immediate use statements. */ + tree defvar = gimple_vdef (stmt); + imm_use_iterator ui; + gimple *use_stmt; + FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar) + { + /* Limit stmt walking. */ + if (++cnt > PARAM_VALUE (PARAM_DSE_MAX_ALIAS_QUERIES_PER_STORE)) + BREAK_FROM_IMM_USE_STMT (ui); + + /* If USE_STMT stores 0 into one or more of the same locations + as STMT and STMT would kill USE_STMT, then we can just remove + USE_STMT. */ + tree fndecl; + if ((is_gimple_assign (use_stmt) + && gimple_vdef (use_stmt) + && ((gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (use_stmt)) == 0 + && !gimple_clobber_p (stmt)) + || (gimple_assign_rhs_code (use_stmt) == INTEGER_CST + && integer_zerop (gimple_assign_rhs1 (use_stmt))))) + || (gimple_call_builtin_p (use_stmt, BUILT_IN_NORMAL) + && (fndecl = gimple_call_fndecl (use_stmt)) != NULL + && (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET + || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET_CHK) + && integer_zerop (gimple_call_arg (use_stmt, 1)))) + { + ao_ref write; + + if (!initialize_ao_ref_for_dse (use_stmt, &write)) + BREAK_FROM_IMM_USE_STMT (ui) + + if (valid_ao_ref_for_dse (&write) + && stmt_kills_ref_p (stmt, &write)) + { + gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); + if (is_gimple_assign (use_stmt)) + delete_dead_or_redundant_assignment (&gsi, "redundant"); + else if (is_gimple_call (use_stmt)) + delete_dead_or_redundant_call (&gsi, "redundant"); + else + gcc_unreachable (); + } + } + } +} + /* A helper of dse_optimize_stmt. Given a GIMPLE_ASSIGN in STMT that writes to REF, classify it according to downstream uses and defs. Sets *BY_CLOBBER_P to true if only clobber statements influenced the classification result. Returns the classification. 
*/ -static dse_store_status +dse_store_status dse_classify_store (ao_ref *ref, gimple *stmt, bool byte_tracking_enabled, sbitmap live_bytes, - bool *by_clobber_p = NULL) + bool *by_clobber_p, tree stop_at_vuse) { gimple *temp; int cnt = 0; @@ -596,6 +688,11 @@ dse_classify_store (ao_ref *ref, gimple *stmt, } else defvar = gimple_vdef (temp); + + /* If we're instructed to stop walking at region boundary, do so. */ + if (defvar == stop_at_vuse) + return DSE_STORE_LIVE; + auto_vec defs; gimple *phi_def = NULL; FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar) @@ -763,12 +860,12 @@ private: /* Delete a dead call at GSI, which is mem* call of some kind. */ static void -delete_dead_call (gimple_stmt_iterator *gsi) +delete_dead_or_redundant_call (gimple_stmt_iterator *gsi, const char *type) { gimple *stmt = gsi_stmt (*gsi); if (dump_file && (dump_flags & TDF_DETAILS)) { - fprintf (dump_file, " Deleted dead call: "); + fprintf (dump_file, " Deleted %s call: ", type); print_gimple_stmt (dump_file, stmt, 0, dump_flags); fprintf (dump_file, "\n"); } @@ -796,13 +893,13 @@ delete_dead_call (gimple_stmt_iterator *gsi) /* Delete a dead store at GSI, which is a gimple assignment. */ -static void -delete_dead_assignment (gimple_stmt_iterator *gsi) +void +delete_dead_or_redundant_assignment (gimple_stmt_iterator *gsi, const char *type) { gimple *stmt = gsi_stmt (*gsi); if (dump_file && (dump_flags & TDF_DETAILS)) { - fprintf (dump_file, " Deleted dead store: "); + fprintf (dump_file, " Deleted %s store: ", type); print_gimple_stmt (dump_file, stmt, 0, dump_flags); fprintf (dump_file, "\n"); } @@ -855,7 +952,8 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) some builtin calls. */ if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) { - switch (DECL_FUNCTION_CODE (gimple_call_fndecl (stmt))) + tree fndecl = gimple_call_fndecl (stmt); + switch (DECL_FUNCTION_CODE (fndecl)) { case BUILT_IN_MEMCPY: case BUILT_IN_MEMMOVE: @@ -867,10 +965,18 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) tree size = gimple_call_arg (stmt, 2); if (integer_zerop (size)) { - delete_dead_call (gsi); + delete_dead_or_redundant_call (gsi, "dead"); return; } + /* If this is a memset call that initializes an object + to zero, it may be redundant with an earlier memset + or empty CONSTRUCTOR of a larger object. */ + if ((DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET + || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_MEMSET_CHK) + && integer_zerop (gimple_call_arg (stmt, 1))) + dse_optimize_redundant_stores (stmt); + enum dse_store_status store_status; m_byte_tracking_enabled = setup_live_bytes_from_ref (&ref, m_live_bytes); @@ -887,10 +993,14 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) } if (store_status == DSE_STORE_DEAD) - delete_dead_call (gsi); + delete_dead_or_redundant_call (gsi, "dead"); return; } + case BUILT_IN_CALLOC: + /* We already know the arguments are integer constants. */ + dse_optimize_redundant_stores (stmt); + default: return; } @@ -900,6 +1010,18 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) { bool by_clobber_p = false; + /* First see if this store is a CONSTRUCTOR and if there + are subsequent CONSTRUCTOR stores which are totally + subsumed by this statement. If so remove the subsequent + CONSTRUCTOR store. + + This will tend to make fewer calls into memset with longer + arguments. 
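A source-level sketch of the effect (editorial, with invented names):

    struct S { int buf[32]; int len; } s = { 0 };   /* typically gimplified to an empty CONSTRUCTOR storing zeros */
    memset (s.buf, 0, sizeof s.buf);                /* redundant, killed by the constructor store */

dse_optimize_redundant_stores walks the immediate uses of the constructor's VDEF, sees that the later zero store is entirely covered by the constructor, and deletes it.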
*/ + if (gimple_assign_rhs_code (stmt) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt)) == 0 + && !gimple_clobber_p (stmt)) + dse_optimize_redundant_stores (stmt); + /* Self-assignments are zombies. */ if (operand_equal_p (gimple_assign_rhs1 (stmt), gimple_assign_lhs (stmt), 0)) @@ -930,7 +1052,7 @@ dse_dom_walker::dse_optimize_stmt (gimple_stmt_iterator *gsi) && !by_clobber_p) return; - delete_dead_assignment (gsi); + delete_dead_or_redundant_assignment (gsi, "dead"); } } diff --git a/gcc/tree-ssa-dse.h b/gcc/tree-ssa-dse.h new file mode 100644 index 00000000000..a5eccbd746d --- /dev/null +++ b/gcc/tree-ssa-dse.h @@ -0,0 +1,36 @@ +/* Support routines for dead store elimination. + Copyright (C) 2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_TREE_SSA_DSE_H +#define GCC_TREE_SSA_DSE_H + +/* Return value from dse_classify_store */ +enum dse_store_status +{ + DSE_STORE_LIVE, + DSE_STORE_MAYBE_PARTIAL_DEAD, + DSE_STORE_DEAD +}; + +dse_store_status dse_classify_store (ao_ref *, gimple *, bool, sbitmap, + bool * = NULL, tree = NULL); + +void delete_dead_or_redundant_assignment (gimple_stmt_iterator *, const char *); + +#endif /* GCC_TREE_SSA_DSE_H */ diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c index d241becd481..2d54e13b180 100644 --- a/gcc/tree-ssa-loop-niter.c +++ b/gcc/tree-ssa-loop-niter.c @@ -1928,7 +1928,7 @@ number_of_iterations_cond (struct loop *loop, tree simplify_replace_tree (tree expr, tree old, tree new_tree, - tree (*valueize) (tree)) + tree (*valueize) (tree, void*), void *context) { unsigned i, n; tree ret = NULL_TREE, e, se; @@ -1944,7 +1944,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree, { if (TREE_CODE (expr) == SSA_NAME) { - new_tree = valueize (expr); + new_tree = valueize (expr, context); if (new_tree != expr) return new_tree; } @@ -1960,7 +1960,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree, for (i = 0; i < n; i++) { e = TREE_OPERAND (expr, i); - se = simplify_replace_tree (e, old, new_tree, valueize); + se = simplify_replace_tree (e, old, new_tree, valueize, context); if (e == se) continue; diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h index dc116489218..fb192d2c250 100644 --- a/gcc/tree-ssa-loop-niter.h +++ b/gcc/tree-ssa-loop-niter.h @@ -53,7 +53,9 @@ extern bool scev_probably_wraps_p (tree, tree, tree, gimple *, struct loop *, bool); extern void free_numbers_of_iterations_estimates (struct loop *); extern void free_numbers_of_iterations_estimates (function *); -extern tree simplify_replace_tree (tree, tree, tree, tree (*)(tree) = NULL); +extern tree simplify_replace_tree (tree, tree, + tree, tree (*)(tree, void *) = NULL, + void * = NULL); extern void substitute_in_loop_info (struct loop *, tree, tree); #endif /* GCC_TREE_SSA_LOOP_NITER_H */ diff --git a/gcc/tree-ssa-loop.c b/gcc/tree-ssa-loop.c index 00a09508836..551718637f1 100644 --- a/gcc/tree-ssa-loop.c +++ 
b/gcc/tree-ssa-loop.c @@ -768,9 +768,9 @@ get_lsm_tmp_name (tree ref, unsigned n, const char *suffix) ns[1] = 0; lsm_tmp_name_add (ns); } - return lsm_tmp_name; if (suffix != NULL) lsm_tmp_name_add (suffix); + return lsm_tmp_name; } /* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. */ diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c index 6794fbde29e..9c1a9a651fe 100644 --- a/gcc/tree-ssa-reassoc.c +++ b/gcc/tree-ssa-reassoc.c @@ -2039,9 +2039,6 @@ optimize_ops_list (enum tree_code opcode, i++; } - length = ops->length (); - oelast = ops->last (); - if (iterate) optimize_ops_list (opcode, ops); } diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c index 95fbead7b1e..cd5a3a75eaa 100644 --- a/gcc/tree-ssa-sccvn.c +++ b/gcc/tree-ssa-sccvn.c @@ -309,6 +309,10 @@ static vn_tables_t valid_info; /* Valueization hook. Valueize NAME if it is an SSA name, otherwise just return it. */ tree (*vn_valueize) (tree); +tree vn_valueize_wrapper (tree t, void* context ATTRIBUTE_UNUSED) +{ + return vn_valueize (t); +} /* This represents the top of the VN lattice, which is the universal @@ -6364,7 +6368,7 @@ process_bb (rpo_elim &avail, basic_block bb, if (bb->loop_father->nb_iterations) bb->loop_father->nb_iterations = simplify_replace_tree (bb->loop_father->nb_iterations, - NULL_TREE, NULL_TREE, vn_valueize); + NULL_TREE, NULL_TREE, &vn_valueize_wrapper); } /* Value-number all defs in the basic-block. */ diff --git a/gcc/tree-ssa-sink.c b/gcc/tree-ssa-sink.c index 2648b24f7d5..98b6caced03 100644 --- a/gcc/tree-ssa-sink.c +++ b/gcc/tree-ssa-sink.c @@ -433,7 +433,6 @@ statement_sink_location (gimple *stmt, basic_block frombb, if (gimple_code (use) != GIMPLE_PHI) { - sinkbb = gimple_bb (use); sinkbb = select_best_block (frombb, gimple_bb (use), stmt); if (sinkbb == frombb) diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c index c3ea2d680d8..91494d76176 100644 --- a/gcc/tree-ssa-threadedge.c +++ b/gcc/tree-ssa-threadedge.c @@ -1299,7 +1299,6 @@ thread_across_edge (gcond *dummy_cond, x = new jump_thread_edge (taken_edge, EDGE_COPY_SRC_JOINER_BLOCK); path->safe_push (x); - found = false; found = thread_around_empty_blocks (taken_edge, dummy_cond, avail_exprs_stack, diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 17a4fc8e279..e822ffc1a01 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2863,10 +2863,12 @@ strip_conversion (tree op) } /* Return true if vectorizable_* routines can handle statements STMT1_INFO - and STMT2_INFO being in a single group. */ + and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can + be grouped in SLP mode. */ static bool -can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) +can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, + bool allow_slp_p) { if (gimple_assign_single_p (stmt1_info->stmt)) return gimple_assign_single_p (stmt2_info->stmt); @@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) like those created by build_mask_conversion. 
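An editorial example of what the new ALLOW_SLP_P path permits (source names invented): after if-conversion, the two conditional loads in

    x = c1[i] ? a[2*i]     : 0.0f;
    y = c2[i] ? a[2*i + 1] : 0.0f;

may become IFN_MASK_LOAD calls with different masks. With ALLOW_SLP_P they can still form one interleaving group; the group is then marked STMT_VINFO_SLP_VECT_ONLY and, if SLP is not used after all, split up again by vect_dissolve_slp_only_groups added later in this diff.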
*/ tree mask1 = gimple_call_arg (call1, 2); tree mask2 = gimple_call_arg (call2, 2); - if (!operand_equal_p (mask1, mask2, 0)) + if (!operand_equal_p (mask1, mask2, 0) + && (ifn == IFN_MASK_STORE || !allow_slp_p)) { mask1 = strip_conversion (mask1); if (!mask1) @@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) || data_ref_compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)) != 0 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 - || !can_group_stmts_p (stmtinfo_a, stmtinfo_b)) + || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true)) break; /* Check that the data-refs have the same constant size. */ @@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; lastinfo = stmtinfo_b; + STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) + = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false); + + if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) + dump_printf_loc (MSG_NOTE, vect_location, + "Load suitable for SLP vectorization only.\n"); + if (init_b == init_prev && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) && dump_enabled_p ()) @@ -3446,7 +3456,6 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) /* First, we collect all data ref pairs for aliasing checks. */ FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) { - int comp_res; poly_uint64 lower_bound; tree segment_length_a, segment_length_b; unsigned HOST_WIDE_INT access_size_a, access_size_b; @@ -3478,10 +3487,13 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); stmt_vec_info stmt_info_b = dr_info_b->stmt; + bool preserves_scalar_order_p + = vect_preserves_scalar_order_p (dr_info_a, dr_info_b); + /* Skip the pair if inter-iteration dependencies are irrelevant and intra-iteration dependencies are guaranteed to be honored. */ if (ignore_step_p - && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b) + && (preserves_scalar_order_p || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, &lower_bound))) { @@ -3562,14 +3574,11 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) align_a = vect_vfa_align (dr_info_a); align_b = vect_vfa_align (dr_info_b); - comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr), - DR_BASE_ADDRESS (dr_info_b->dr)); - if (comp_res == 0) - comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr), - DR_OFFSET (dr_info_b->dr)); - /* See whether the alias is known at compilation time. */ - if (comp_res == 0 + if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr), + DR_BASE_ADDRESS (dr_info_b->dr), 0) + && operand_equal_p (DR_OFFSET (dr_info_a->dr), + DR_OFFSET (dr_info_b->dr), 0) && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST && poly_int_tree_p (segment_length_a) @@ -3602,15 +3611,21 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) stmt_info_b->stmt); } - dr_with_seg_len_pair_t dr_with_seg_len_pair - (dr_with_seg_len (dr_info_a->dr, segment_length_a, - access_size_a, align_a), - dr_with_seg_len (dr_info_b->dr, segment_length_b, - access_size_b, align_b)); + dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a, + access_size_a, align_a); + dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b, + access_size_b, align_b); + /* Canonicalize the order to be the one that's needed for accurate + RAW, WAR and WAW flags, in cases where the data references are + well-ordered. 
The order doesn't really matter otherwise, + but we might as well be consistent. */ + if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a) + std::swap (dr_a, dr_b); - /* Canonicalize pairs by sorting the two DR members. */ - if (comp_res > 0) - std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); + dr_with_seg_len_pair_t dr_with_seg_len_pair + (dr_a, dr_b, (preserves_scalar_order_p + ? dr_with_seg_len_pair_t::WELL_ORDERED + : dr_with_seg_len_pair_t::REORDERED)); comp_alias_ddrs.safe_push (dr_with_seg_len_pair); } @@ -4123,7 +4138,7 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt, */ opt_result -vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) +vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) { struct loop *loop = NULL; unsigned int i; @@ -4298,7 +4313,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) /* Set vectype for STMT. */ scalar_type = TREE_TYPE (DR_REF (dr)); STMT_VINFO_VECTYPE (stmt_info) - = get_vectype_for_scalar_type (scalar_type); + = get_vectype_for_scalar_type (vinfo, scalar_type); if (!STMT_VINFO_VECTYPE (stmt_info)) { if (dump_enabled_p ()) @@ -4344,13 +4359,18 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) if (!vect_check_gather_scatter (stmt_info, as_a (vinfo), &gs_info) - || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) - return opt_result::failure_at - (stmt_info->stmt, - (gatherscatter == GATHER) ? - "not vectorized: not suitable for gather load %G" : - "not vectorized: not suitable for scatter store %G", - stmt_info->stmt); + || !get_vectype_for_scalar_type (vinfo, + TREE_TYPE (gs_info.offset))) + { + if (fatal) + *fatal = false; + return opt_result::failure_at + (stmt_info->stmt, + (gatherscatter == GATHER) + ? "not vectorized: not suitable for gather load %G" + : "not vectorized: not suitable for scatter store %G", + stmt_info->stmt); + } STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; } } diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index ad1ea4e7b97..39bc2a82b37 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -694,7 +694,7 @@ expand_vector_divmod (gimple_stmt_iterator *gsi, tree type, tree op0, tree zero, cst, cond, mask_type; gimple *stmt; - mask_type = build_same_sized_truth_vector_type (type); + mask_type = truth_type_for (type); zero = build_zero_cst (type); cond = build2 (LT_EXPR, mask_type, op0, zero); tree_vector_builder vec (type, nunits, 1); diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index b3fae5ba4da..20ede85633b 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -47,6 +47,9 @@ along with GCC; see the file COPYING3. 
If not see #include "stor-layout.h" #include "optabs-query.h" #include "vec-perm-indices.h" +#include "insn-config.h" +#include "rtl.h" +#include "recog.h" /************************************************************************* Simple Loop Peeling Utilities @@ -323,13 +326,18 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, tree src_masktype = src_rgm->mask_type; tree dest_masktype = dest_rgm->mask_type; machine_mode src_mode = TYPE_MODE (src_masktype); + insn_code icode1, icode2; if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter - && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing - && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing) + && (icode1 = optab_handler (vec_unpacku_hi_optab, + src_mode)) != CODE_FOR_nothing + && (icode2 = optab_handler (vec_unpacku_lo_optab, + src_mode)) != CODE_FOR_nothing) { /* Unpacking the source masks gives at least as many mask bits as we need. We can then VIEW_CONVERT any excess bits away. */ - tree unpack_masktype = vect_halve_mask_nunits (src_masktype); + machine_mode dest_mode = insn_data[icode1].operand[0].mode; + gcc_assert (dest_mode == insn_data[icode2].operand[0].mode); + tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode); for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i) { tree src = src_rgm->masks[i / 2]; @@ -1745,7 +1753,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code) Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. CODE and NITERS are as for vect_update_inits_of_dr. */ -static void +void vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, tree_code code) { @@ -1755,21 +1763,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); - /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ + /* Adjust niters to sizetype. We used to insert the stmts on loop preheader + here, but since we might use these niters to update the epilogues niters + and data references we can't insert them here as this definition might not + always dominate its uses. */ if (!types_compatible_p (sizetype, TREE_TYPE (niters))) - { - gimple_seq seq; - edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); - tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters"); - - niters = fold_convert (sizetype, niters); - niters = force_gimple_operand (niters, &seq, false, var); - if (seq) - { - basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); - gcc_assert (!new_bb); - } - } + niters = fold_convert (sizetype, niters); FOR_EACH_VEC_ELT (datarefs, i, dr) { @@ -2032,6 +2031,29 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo, *niters_vector_mult_vf_ptr = niters_vector_mult_vf; } +/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, + this function searches for the corresponding lcssa phi node in exit + bb of LOOP. If it is found, return the phi result; otherwise return + NULL. 
*/ + +static tree +find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED, + gphi *lcssa_phi) +{ + gphi_iterator gsi; + edge e = single_exit (loop); + + gcc_assert (single_pred_p (e->dest)); + for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + if (operand_equal_p (PHI_ARG_DEF (phi, 0), + PHI_ARG_DEF (lcssa_phi, 0), 0)) + return PHI_RESULT (phi); + } + return NULL_TREE; +} + /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND from SECOND/FIRST and puts it at the original loop's preheader/exit edge, the two loops are arranged as below: @@ -2119,6 +2141,29 @@ slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo, incoming edge. */ adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg); } + + /* For epilogue peeling we have to make sure to copy all LC PHIs + for correct vectorization of live stmts. */ + if (loop == first) + { + basic_block orig_exit = single_exit (second)->dest; + for (gsi_orig = gsi_start_phis (orig_exit); + !gsi_end_p (gsi_orig); gsi_next (&gsi_orig)) + { + gphi *orig_phi = gsi_orig.phi (); + tree orig_arg = PHI_ARG_DEF (orig_phi, 0); + if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg)) + continue; + + /* Already created in the above loop. */ + if (find_guard_arg (first, second, orig_phi)) + continue; + + tree new_res = copy_ssa_name (orig_arg); + gphi *lcphi = create_phi_node (new_res, between_bb); + add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION); + } + } } /* Function slpeel_add_loop_guard adds guard skipping from the beginning @@ -2203,29 +2248,6 @@ slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop, } } -/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP, - this function searches for the corresponding lcssa phi node in exit - bb of LOOP. If it is found, return the phi result; otherwise return - NULL. */ - -static tree -find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED, - gphi *lcssa_phi) -{ - gphi_iterator gsi; - edge e = single_exit (loop); - - gcc_assert (single_pred_p (e->dest)); - for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi)) - { - gphi *phi = gsi.phi (); - if (operand_equal_p (PHI_ARG_DEF (phi, 0), - PHI_ARG_DEF (lcssa_phi, 0), 0)) - return PHI_RESULT (phi); - } - return NULL_TREE; -} - /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied from LOOP. Function slpeel_add_loop_guard adds guard skipping from a point between the two loops to the end of EPILOG. Edges GUARD_EDGE @@ -2296,12 +2318,14 @@ slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog, { gphi *update_phi = gsi.phi (); tree old_arg = PHI_ARG_DEF (update_phi, 0); - /* This loop-closed-phi actually doesn't represent a use out of the - loop - the phi arg is a constant. */ - if (TREE_CODE (old_arg) != SSA_NAME) - continue; - tree merge_arg = get_current_def (old_arg); + tree merge_arg = NULL_TREE; + + /* If the old argument is a SSA_NAME use its current_def. */ + if (TREE_CODE (old_arg) == SSA_NAME) + merge_arg = get_current_def (old_arg); + /* If it's a constant or doesn't have a current_def, just use the old + argument. */ if (!merge_arg) merge_arg = old_arg; @@ -2412,7 +2436,22 @@ slpeel_update_phi_nodes_for_lcssa (struct loop *epilog) Note this function peels prolog and epilog only if it's necessary, as well as guards. - Returns created epilogue or NULL. 
+ This function returns the epilogue loop if a decision was made to vectorize + it, otherwise NULL. + + The analysis resulting in this epilogue loop's loop_vec_info was performed + in the same vect_analyze_loop call as the main loop's. At that time + vect_analyze_loop constructs a list of accepted loop_vec_info's for lower + vectorization factors than the main loop. This list is stored in the main + loop's loop_vec_info in the 'epilogue_vinfos' member. Everytime we decide to + vectorize the epilogue loop for a lower vectorization factor, the + loop_vec_info sitting at the top of the epilogue_vinfos list is removed, + updated and linked to the epilogue loop. This is later used to vectorize + the epilogue. The reason the loop_vec_info needs updating is that it was + constructed based on the original main loop, and the epilogue loop is a + copy of this loop, so all links pointing to statements in the original loop + need updating. Furthermore, these loop_vec_infos share the + data_reference's records, which will also need to be updated. TODO: Guard for prefer_scalar_loop should be emitted along with versioning conditions if loop versioning is needed. */ @@ -2422,7 +2461,8 @@ struct loop * vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, tree *niters_vector, tree *step_vector, tree *niters_vector_mult_vf_var, int th, - bool check_profitability, bool niters_no_overflow) + bool check_profitability, bool niters_no_overflow, + tree *advance, drs_init_vec &orig_drs_init) { edge e, guard_e; tree type = TREE_TYPE (niters), guard_cond; @@ -2430,6 +2470,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, profile_probability prob_prolog, prob_vector, prob_epilog; int estimated_vf; int prolog_peeling = 0; + bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; /* We currently do not support prolog peeling if the target alignment is not known at compile time. 'vect_gen_prolog_loop_niters' depends on the target alignment being constant. */ @@ -2483,19 +2524,77 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, int bound_prolog = 0; if (prolog_peeling) niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, - &bound_prolog); + &bound_prolog); else niters_prolog = build_int_cst (type, 0); + loop_vec_info epilogue_vinfo = NULL; + if (vect_epilogues) + { + epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; + loop_vinfo->epilogue_vinfos.ordered_remove (0); + } + + tree niters_vector_mult_vf = NULL_TREE; + /* Saving NITERs before the loop, as this may be changed by prologue. */ + tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); + edge update_e = NULL, skip_e = NULL; + unsigned int lowest_vf = constant_lower_bound (vf); + /* If we know the number of scalar iterations for the main loop we should + check whether after the main loop there are enough iterations left over + for the epilogue. 
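A worked example with made-up numbers (editorial, matching the computation just below): with a main-loop VF of 16, prolog peeling of 3, no peeling for gaps and 100 known scalar iterations, eiters = (100 - 3) % 16 = 1. An epilogue candidate using half-width vectors (ratio 2) needs at least 16 / 2 = 8 leftover iterations, so it is deleted and the next candidate, if any, is tried; with 110 iterations the leftover would be 107 % 16 = 11, which is enough to keep it.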
*/ + if (vect_epilogues + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && prolog_peeling >= 0 + && known_eq (vf, lowest_vf)) + { + unsigned HOST_WIDE_INT eiters + = (LOOP_VINFO_INT_NITERS (loop_vinfo) + - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); + + eiters -= prolog_peeling; + eiters + = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); + + unsigned int ratio; + unsigned int epilogue_gaps + = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); + while (!(constant_multiple_p + (GET_MODE_SIZE (loop_vinfo->vector_mode), + GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio) + && eiters >= lowest_vf / ratio + epilogue_gaps)) + { + delete epilogue_vinfo; + epilogue_vinfo = NULL; + if (loop_vinfo->epilogue_vinfos.length () == 0) + { + vect_epilogues = false; + break; + } + epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; + loop_vinfo->epilogue_vinfos.ordered_remove (0); + epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); + } + } /* Prolog loop may be skipped. */ bool skip_prolog = (prolog_peeling != 0); - /* Skip to epilog if scalar loop may be preferred. It's only needed - when we peel for epilog loop and when it hasn't been checked with - loop versioning. */ + /* Skip this loop to epilog when there are not enough iterations to enter this + vectorized loop. If true we should perform runtime checks on the NITERS + to check whether we should skip the current vectorized loop. If we know + the number of scalar iterations we may choose to add a runtime check if + this number "maybe" smaller than the number of iterations required + when we know the number of scalar iterations may potentially + be smaller than the number of iterations required to enter this loop, for + this we use the upper bounds on the prolog and epilog peeling. When we + don't know the number of iterations and don't require versioning it is + because we have asserted that there are enough scalar iterations to enter + the main loop, so this skip is not necessary. When we are versioning then + we only add such a skip if we have chosen to vectorize the epilogue. */ bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), bound_prolog + bound_epilog) - : !LOOP_REQUIRES_VERSIONING (loop_vinfo)); + : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) + || vect_epilogues)); /* Epilog loop must be executed if the number of iterations for epilog loop is known at compile time, otherwise we need to add a check at the end of vector loop and skip to the end of epilog loop. */ @@ -2525,6 +2624,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, dump_user_location_t loop_loc = find_loop_location (loop); struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); + if (vect_epilogues) + /* Make sure to set the epilogue's epilogue scalar loop, such that we can + use the original scalar loop as remaining epilogue if necessary. */ + LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo) + = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); + if (prolog_peeling) { e = loop_preheader_edge (loop); @@ -2571,6 +2676,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); scale_loop_profile (prolog, prob_prolog, bound_prolog); } + + /* Save original inits for each data_reference before advancing them with + NITERS_PROLOG. 
*/ + unsigned int i; + struct data_reference *dr; + vec datarefs = loop_vinfo->shared->datarefs; + FOR_EACH_VEC_ELT (datarefs, i, dr) + orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr))); + /* Update init address of DRs. */ vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); /* Update niters for vector loop. */ @@ -2605,8 +2719,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, "loop can't be duplicated to exit edge.\n"); gcc_unreachable (); } - /* Peel epilog and put it on exit edge of loop. */ - epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e); + /* Peel epilog and put it on exit edge of loop. If we are vectorizing + said epilog then we should use a copy of the main loop as a starting + point. This loop may have already had some preliminary transformations + to allow for more optimal vectorization, for example if-conversion. + If we are not vectorizing the epilog then we should use the scalar loop + as the transformations mentioned above make less or no sense when not + vectorizing. */ + epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; + epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); if (!epilog) { dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, @@ -2635,6 +2756,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, guard_to, guard_bb, prob_vector.invert (), irred_flag); + skip_e = guard_e; e = EDGE_PRED (guard_to, 0); e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); @@ -2656,7 +2778,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; - tree niters_vector_mult_vf; /* If loop is peeled for non-zero constant times, now niters refers to orig_niters - prolog_peeling, it won't overflow even the orig_niters overflows. */ @@ -2679,7 +2800,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, /* Update IVs of original loop as if they were advanced by niters_vector_mult_vf steps. */ gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); - edge update_e = skip_vector ? e : loop_preheader_edge (epilog); + update_e = skip_vector ? e : loop_preheader_edge (epilog); vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, update_e); @@ -2720,10 +2841,75 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, adjust_vec_debug_stmts (); scev_reset (); } + + if (vect_epilogues) + { + epilog->aux = epilogue_vinfo; + LOOP_VINFO_LOOP (epilogue_vinfo) = epilog; + + loop_constraint_clear (epilog, LOOP_C_INFINITE); + + /* We now must calculate the number of NITERS performed by the previous + loop and EPILOGUE_NITERS to be performed by the epilogue. */ + tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf), + niters_prolog, niters_vector_mult_vf); + + /* If skip_vector we may skip the previous loop, we insert a phi-node to + determine whether we are coming from the previous vectorized loop + using the update_e edge or the skip_vector basic block using the + skip_e edge. 
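In GIMPLE terms the intent is roughly (editorial sketch, SSA names invented):

    niters_done_1 = niters_prolog + niters_vector_mult_vf;
    # niters_done_2 = PHI <niters_done_1 (update_e), 0 (skip_e)>

and niters_done_2 is what ends up in *ADVANCE, i.e. the number of scalar iterations already executed by the prolog plus the vectorized main loop, or zero when both were skipped.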
*/ + if (skip_vector) + { + gcc_assert (update_e != NULL && skip_e != NULL); + gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), + update_e->dest); + tree new_ssa = make_ssa_name (TREE_TYPE (niters)); + gimple *stmt = gimple_build_assign (new_ssa, niters); + gimple_stmt_iterator gsi; + if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME + && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL) + { + gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf)); + gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); + } + else + { + gsi = gsi_last_bb (update_e->src); + gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); + } + + niters = new_ssa; + add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION); + add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, + UNKNOWN_LOCATION); + niters = PHI_RESULT (new_phi); + } + + /* Subtract the number of iterations performed by the vectorized loop + from the number of total iterations. */ + tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), + before_loop_niters, + niters); + + LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; + LOOP_VINFO_NITERSM1 (epilogue_vinfo) + = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), + epilogue_niters, + build_one_cst (TREE_TYPE (epilogue_niters))); + + /* Set ADVANCE to the number of iterations performed by the previous + loop and its prologue. */ + *advance = niters; + + /* Redo the peeling for niter analysis as the NITERs and alignment + may have been updated to take the main loop into account. */ + determine_peel_for_niter (epilogue_vinfo); + } + adjust_vec.release (); free_original_copy_tables (); - return epilog; + return vect_epilogues ? epilog : NULL; } /* Function vect_create_cond_for_niters_checks. @@ -2987,9 +3173,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr) *COND_EXPR_STMT_LIST. */ struct loop * -vect_loop_versioning (loop_vec_info loop_vinfo, - unsigned int th, bool check_profitability, - poly_uint64 versioning_threshold) +vect_loop_versioning (loop_vec_info loop_vinfo) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop; struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); @@ -3009,10 +3193,15 @@ vect_loop_versioning (loop_vec_info loop_vinfo, bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo); bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo); + poly_uint64 versioning_threshold + = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); tree version_simd_if_cond = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo); + unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); - if (check_profitability) + if (th >= vect_vf_for_cost (loop_vinfo) + && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && !ordered_p (th, versioning_threshold)) cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters, build_int_cst (TREE_TYPE (scalar_loop_iters), th - 1)); diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 0308b26b808..6cbdfd1ad1a 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -154,6 +154,8 @@ along with GCC; see the file COPYING3. If not see */ static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *); +static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info, + bool *); /* Subroutine of vect_determine_vf_for_stmt that handles only one statement. 
VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE @@ -325,7 +327,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) "get vectype for scalar type: %T\n", scalar_type); - vectype = get_vectype_for_scalar_type (scalar_type); + vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); if (!vectype) return opt_result::failure_at (phi, "not vectorized: unsupported " @@ -559,19 +561,19 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); stmt_vec_info reduc_stmt_info - = vect_force_simple_reduction (loop_vinfo, stmt_vinfo, - &double_reduc, false); + = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc); if (reduc_stmt_info) { - if (double_reduc) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, + STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info; + STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo; + if (double_reduc) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "Detected double reduction.\n"); STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; - STMT_VINFO_DEF_TYPE (reduc_stmt_info) - = vect_double_reduction_def; + STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def; } else { @@ -582,7 +584,6 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) "Detected vectorizable nested cycle.\n"); STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; - STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle; } else { @@ -688,13 +689,16 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo) stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first); while (next) { - if (! STMT_VINFO_IN_PATTERN_P (next)) + if (! STMT_VINFO_IN_PATTERN_P (next) + || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1) break; next = REDUC_GROUP_NEXT_ELEMENT (next); } - /* If not all stmt in the chain are patterns try to handle - the chain without patterns. */ - if (! next) + /* If not all stmt in the chain are patterns or if we failed + to update STMT_VINFO_REDUC_IDX try to handle the chain + without patterns. */ + if (! next + && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1) { vect_fixup_reduc_chain (first); LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i] @@ -730,9 +734,7 @@ vect_get_loop_niters (struct loop *loop, tree *assumptions, if (!exit) return cond; - niter = chrec_dont_know; may_be_zero = NULL_TREE; - niter_assumptions = boolean_true_node; if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL) || chrec_contains_undetermined (niter_desc.niter)) return cond; @@ -826,6 +828,8 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) ivexpr_map (NULL), slp_unrolling_factor (1), single_scalar_iteration_cost (0), + vec_outside_cost (0), + vec_inside_cost (0), vectorizable (false), can_fully_mask_p (true), fully_masked_p (false), @@ -885,6 +889,8 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared) } } } + + epilogue_vinfos.create (6); } /* Free all levels of MASKS. 
*/ @@ -959,6 +965,7 @@ _loop_vec_info::~_loop_vec_info () release_vec_loop_masks (&masks); delete ivexpr_map; + epilogue_vinfos.release (); loop->aux = NULL; } @@ -1431,8 +1438,8 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo) dump_printf_loc (MSG_NOTE, vect_location, "Loop contains SLP and non-SLP stmts\n"); /* Both the vectorization factor and unroll factor have the form - current_vector_size * X for some rational X, so they must have - a common multiple. */ + GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X, + so they must have a common multiple. */ vectorization_factor = force_common_multiple (vectorization_factor, LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); @@ -1535,12 +1542,18 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) phi_op = PHI_ARG_DEF (phi, 0); stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); if (!op_def_info) - return opt_result::failure_at (phi, "unsupported phi"); + return opt_result::failure_at (phi, "unsupported phi\n"); if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer && (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer_by_reduction)) - return opt_result::failure_at (phi, "unsupported phi"); + return opt_result::failure_at (phi, "unsupported phi\n"); + + if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def + || (STMT_VINFO_DEF_TYPE (stmt_info) + == vect_double_reduction_def)) + && !vectorizable_lc_phi (stmt_info, NULL, NULL)) + return opt_result::failure_at (phi, "unsupported phi\n"); } continue; @@ -1564,18 +1577,19 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, &cost_vec); else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + || (STMT_VINFO_DEF_TYPE (stmt_info) + == vect_double_reduction_def) || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) && ! PURE_SLP_STMT (stmt_info)) - ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL, - &cost_vec); + ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec); } /* SLP PHIs are tested by vect_slp_analyze_node_operations. */ if (ok && STMT_VINFO_LIVE_P (stmt_info) && !PURE_SLP_STMT (stmt_info)) - ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL, - &cost_vec); + ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL, + -1, NULL, &cost_vec); if (!ok) return opt_result::failure_at (phi, @@ -1692,9 +1706,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) return 0; } - HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); - if (estimated_niter == -1) - estimated_niter = likely_max_stmt_executions_int (loop); + HOST_WIDE_INT estimated_niter; + + /* If we are vectorizing an epilogue then we know the maximum number of + scalar iterations it will cover is at least one lower than the + vectorization factor of the main loop. */ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + estimated_niter + = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1; + else + { + estimated_niter = estimated_stmt_executions_int (loop); + if (estimated_niter == -1) + estimated_niter = likely_max_stmt_executions_int (loop); + } if (estimated_niter != -1 && ((unsigned HOST_WIDE_INT) estimated_niter < MAX (th, (unsigned) min_profitable_estimate))) @@ -1774,6 +1799,101 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, return opt_result::success (); } +/* Look for SLP-only access groups and turn each individual access into its own + group. 
*/ +static void +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) +{ + unsigned int i; + struct data_reference *dr; + + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); + + vec datarefs = loop_vinfo->shared->datarefs; + FOR_EACH_VEC_ELT (datarefs, i, dr) + { + gcc_assert (DR_REF (dr)); + stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr)); + + /* Check if the load is a part of an interleaving chain. */ + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); + unsigned int group_size = DR_GROUP_SIZE (first_element); + + /* Check if SLP-only groups. */ + if (!STMT_SLP_TYPE (stmt_info) + && STMT_VINFO_SLP_VECT_ONLY (first_element)) + { + /* Dissolve the group. */ + STMT_VINFO_SLP_VECT_ONLY (first_element) = false; + + stmt_vec_info vinfo = first_element; + while (vinfo) + { + stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); + DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; + DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; + DR_GROUP_SIZE (vinfo) = 1; + DR_GROUP_GAP (vinfo) = group_size - 1; + vinfo = next; + } + } + } + } +} + + +/* Decides whether we need to create an epilogue loop to handle + remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */ + +void +determine_peel_for_niter (loop_vec_info loop_vinfo) +{ + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; + + unsigned HOST_WIDE_INT const_vf; + HOST_WIDE_INT max_niter + = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); + + unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); + if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) + th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO + (loop_vinfo)); + + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + /* The main loop handles all iterations. */ + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; + else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) + { + /* Work out the (constant) number of iterations that need to be + peeled for reasons other than niters. */ + unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + peel_niter += 1; + if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, + LOOP_VINFO_VECT_FACTOR (loop_vinfo))) + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; + } + else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) + /* ??? When peeling for gaps but not alignment, we could + try to check whether the (variable) niters is known to be + VF * N + 1. That's something of a niche case though. */ + || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) + || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) + < (unsigned) exact_log2 (const_vf)) + /* In case of versioning, check if the maximum number of + iterations is greater than th. If they are identical, + the epilogue is unnecessary. */ + && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) + || ((unsigned HOST_WIDE_INT) max_niter + > (th / const_vf) * const_vf)))) + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; +} + + /* Function vect_analyze_loop_2. 
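determine_peel_for_niter, factored out above so it can be re-run once an epilogue loop's NITERS are known, reduces in the constant-trip-count case to a divisibility check. A minimal standalone sketch under that assumption (illustrative names, not the GCC interface):

/* Assuming NITERS, the peeling amounts and VF are compile-time constants:
   after peeling, the vector loop handles a multiple of VF iterations, so
   a scalar epilogue is needed iff the remainder is non-zero.  */
static int
needs_scalar_epilogue (unsigned long niters, unsigned peel_for_alignment,
                       int peel_for_gaps, unsigned vf)
{
  unsigned peel = peel_for_alignment + (peel_for_gaps ? 1 : 0);
  return (niters - peel) % vf != 0;
}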
Apply a set of analyses on LOOP, and create a loop_vec_info struct @@ -1786,6 +1906,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) int res; unsigned int max_vf = MAX_VECTORIZATION_FACTOR; poly_uint64 min_vf = 2; + loop_vec_info orig_loop_vinfo = NULL; + + /* If we are dealing with an epilogue then orig_loop_vinfo points to the + loop_vec_info of the first vectorized loop. */ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + else + orig_loop_vinfo = loop_vinfo; + gcc_assert (orig_loop_vinfo); /* The first group of checks is independent of the vector size. */ fatal = true; @@ -1824,7 +1953,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) /* Analyze the data references and also adjust the minimal vectorization factor according to the loads and stores. */ - ok = vect_analyze_data_refs (loop_vinfo, &min_vf); + ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); if (!ok) { if (dump_enabled_p ()) @@ -1855,7 +1984,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) /* Data-flow analysis to detect stmts that do not need to be vectorized. */ - ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); + ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); if (!ok) { if (dump_enabled_p ()) @@ -1901,7 +2030,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) vect_compute_single_scalar_iteration_cost (loop_vinfo); poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - unsigned th; /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ ok = vect_analyze_slp (loop_vinfo, *n_stmts); @@ -1941,9 +2069,6 @@ start_over: LOOP_VINFO_INT_NITERS (loop_vinfo)); } - HOST_WIDE_INT max_niter - = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); - /* Analyze the alignment of the data-refs in the loop. Fail if a data reference is found that cannot be vectorized. */ @@ -1990,6 +2115,9 @@ start_over: } } + /* Dissolve SLP-only groups. */ + vect_dissolve_slp_only_groups (loop_vinfo); + /* Scan all the remaining operations in the loop that are not subject to SLP and make sure they are vectorizable. */ ok = vect_analyze_loop_operations (loop_vinfo); @@ -2032,6 +2160,16 @@ start_over: " support peeling for gaps.\n"); } + /* If we're vectorizing an epilogue loop, we either need a fully-masked + loop or a loop that has a lower VF than the main loop. */ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))) + return opt_result::failure_at (vect_location, + "Vectorization factor too high for" + " epilogue loop.\n"); + /* Check the costings of the loop make vectorizing worthwhile. */ res = vect_analyze_loop_costing (loop_vinfo); if (res < 0) @@ -2044,42 +2182,7 @@ start_over: return opt_result::failure_at (vect_location, "Loop costings not worthwhile.\n"); - /* Decide whether we need to create an epilogue loop to handle - remaining scalar iterations. */ - th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); - - unsigned HOST_WIDE_INT const_vf; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) - /* The main loop handles all iterations. 
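The threshold handling above folds the cost-model threshold into the versioning threshold so that a single runtime guard can cover both conditions. Roughly, with scalar counts standing in for poly_uint64 and illustrative names:

/* Mirror of the condition used at transform time: only raise the
   versioning threshold to the cost-model threshold when that threshold
   exceeds the VF and the trip count is not known at compile time.  */
static unsigned
combined_versioning_threshold (unsigned peel_based_th, unsigned cost_model_th,
                               unsigned vf, int niters_known)
{
  unsigned th = peel_based_th;
  if (cost_model_th >= vf && !niters_known && cost_model_th > th)
    th = cost_model_th;
  return th;
}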
*/ - LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; - else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) - { - /* Work out the (constant) number of iterations that need to be - peeled for reasons other than niters. */ - unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); - if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) - peel_niter += 1; - if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter, - LOOP_VINFO_VECT_FACTOR (loop_vinfo))) - LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; - } - else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) - /* ??? When peeling for gaps but not alignment, we could - try to check whether the (variable) niters is known to be - VF * N + 1. That's something of a niche case though. */ - || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) - || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) - < (unsigned) exact_log2 (const_vf)) - /* In case of versioning, check if the maximum number of - iterations is greater than th. If they are identical, - the epilogue is unnecessary. */ - && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) - || ((unsigned HOST_WIDE_INT) max_niter - > (th / const_vf) * const_vf)))) - LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; - + determine_peel_for_niter (loop_vinfo); /* If an epilogue loop is required make sure we can create one. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) @@ -2101,10 +2204,21 @@ start_over: /* During peeling, we need to check if number of loop iterations is enough for both peeled prolog loop and vector loop. This check can be merged along with threshold check of loop versioning, so - increase threshold for this case if necessary. */ - if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + increase threshold for this case if necessary. + + If we are analyzing an epilogue we still want to check what its + versioning threshold would be. If we decide to vectorize the epilogues we + will want to use the lowest versioning threshold of all epilogues and main + loop. This will enable us to enter a vectorized epilogue even when + versioning the loop. We can't simply check whether the epilogue requires + versioning though since we may have skipped some versioning checks when + analyzing the epilogue. For instance, checks for alias versioning will be + skipped when dealing with epilogues as we assume we already checked them + for the main loop. So instead we always check the 'orig_loop_vinfo'. */ + if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) { poly_uint64 niters_th = 0; + unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) { @@ -2125,6 +2239,14 @@ start_over: /* One additional iteration because of peeling for gap. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) niters_th += 1; + + /* Use the same condition as vect_transform_loop to decide when to use + the cost to determine a versioning threshold. */ + if (th >= vect_vf_for_cost (loop_vinfo) + && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && ordered_p (th, niters_th)) + niters_th = ordered_max (poly_uint64 (th), niters_th); + LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; } @@ -2240,22 +2362,95 @@ again: goto start_over; } +/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears + to be better than vectorizing it using OLD_LOOP_VINFO. 
Assume that + OLD_LOOP_VINFO is better unless something specifically indicates + otherwise. + + Note that this deliberately isn't a partial order. */ + +static bool +vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, + loop_vec_info old_loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); + gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); + + poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); + poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); + + /* Always prefer a VF of loop->simdlen over any other VF. */ + if (loop->simdlen) + { + bool new_simdlen_p = known_eq (new_vf, loop->simdlen); + bool old_simdlen_p = known_eq (old_vf, loop->simdlen); + if (new_simdlen_p != old_simdlen_p) + return new_simdlen_p; + } + + /* Limit the VFs to what is likely to be the maximum number of iterations, + to handle cases in which at least one loop_vinfo is fully-masked. */ + HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + if (estimated_max_niter != -1) + { + if (known_le (estimated_max_niter, new_vf)) + new_vf = estimated_max_niter; + if (known_le (estimated_max_niter, old_vf)) + old_vf = estimated_max_niter; + } + + /* Check whether the (fractional) cost per scalar iteration is lower + or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ + poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost + * poly_widest_int (old_vf)); + poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost + * poly_widest_int (new_vf)); + if (maybe_lt (rel_old, rel_new)) + return false; + if (known_lt (rel_new, rel_old)) + return true; + + /* If there's nothing to choose between the loop bodies, see whether + there's a difference in the prologue and epilogue costs. */ + if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost) + return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost; + + return false; +} + +/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return + true if we should. */ + +static bool +vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, + loop_vec_info old_loop_vinfo) +{ + if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) + return false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Preferring vector mode %s to vector mode %s\n", + GET_MODE_NAME (new_loop_vinfo->vector_mode), + GET_MODE_NAME (old_loop_vinfo->vector_mode)); + return true; +} + /* Function vect_analyze_loop. Apply a set of analyses on LOOP, and create a loop_vec_info struct for it. The different analyses will record information in the - loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must - be vectorized. */ + loop_vec_info struct. */ opt_loop_vec_info -vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, - vec_info_shared *shared) +vect_analyze_loop (struct loop *loop, vec_info_shared *shared) { - auto_vector_sizes vector_sizes; + auto_vector_modes vector_modes; /* Autodetect first vector size we try. 
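vect_better_loop_vinfo_p above compares the two candidates by cost per scalar iteration, new_inside_cost / new_vf against old_inside_cost / old_vf, and avoids fractional arithmetic by cross-multiplying. The same idea with plain integers (a simplified sketch, not the poly_int-aware GCC code):

/* Returns nonzero when the new candidate is strictly cheaper per scalar
   iteration.  A wide intermediate type keeps the products from
   overflowing for realistic cost values.  */
static int
cheaper_per_scalar_iter (unsigned new_inside_cost, unsigned new_vf,
                         unsigned old_inside_cost, unsigned old_vf)
{
  unsigned long long rel_new = (unsigned long long) new_inside_cost * old_vf;
  unsigned long long rel_old = (unsigned long long) old_inside_cost * new_vf;
  return rel_new < rel_old;
}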
*/ - current_vector_size = 0; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); - unsigned int next_size = 0; + unsigned int autovec_flags + = targetm.vectorize.autovectorize_vector_modes (&vector_modes, + loop->simdlen != 0); + unsigned int mode_i = 0; DUMP_VECT_SCOPE ("analyze_loop_nest"); @@ -2272,58 +2467,221 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, " loops cannot be vectorized\n"); unsigned n_stmts = 0; - poly_uint64 autodetected_vector_size = 0; + machine_mode autodetected_vector_mode = VOIDmode; + opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); + machine_mode next_vector_mode = VOIDmode; + poly_uint64 lowest_th = 0; + unsigned vectorized_loops = 0; + bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS) + && !unlimited_cost_model (loop)); + + bool vect_epilogues = false; + opt_result res = opt_result::success (); + unsigned HOST_WIDE_INT simdlen = loop->simdlen; while (1) { /* Check the CFG characteristics of the loop (nesting, entry/exit). */ - opt_loop_vec_info loop_vinfo - = vect_analyze_loop_form (loop, shared); + opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared); if (!loop_vinfo) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "bad loop form.\n"); + gcc_checking_assert (first_loop_vinfo == NULL); return loop_vinfo; } + loop_vinfo->vector_mode = next_vector_mode; bool fatal = false; - if (orig_loop_vinfo) - LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; + /* When pick_lowest_cost_p is true, we should in principle iterate + over all the loop_vec_infos that LOOP_VINFO could replace and + try to vectorize LOOP_VINFO under the same conditions. + E.g. when trying to replace an epilogue loop, we should vectorize + LOOP_VINFO as an epilogue loop with the same VF limit. When trying + to replace the main loop, we should vectorize LOOP_VINFO as a main + loop too. + + However, autovectorize_vector_modes is usually sorted as follows: + + - Modes that naturally produce lower VFs usually follow modes that + naturally produce higher VFs. + + - When modes naturally produce the same VF, maskable modes + usually follow unmaskable ones, so that the maskable mode + can be used to vectorize the epilogue of the unmaskable mode. + + This order is preferred because it leads to the maximum + epilogue vectorization opportunities. Targets should only use + a different order if they want to make wide modes available while + disparaging them relative to earlier, smaller modes. The assumption + in that case is that the wider modes are more expensive in some + way that isn't reflected directly in the costs. + + There should therefore be few interesting cases in which + LOOP_VINFO fails when treated as an epilogue loop, succeeds when + treated as a standalone loop, and ends up being genuinely cheaper + than FIRST_LOOP_VINFO. 
*/ + if (vect_epilogues) + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; + + res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); + if (mode_i == 0) + autodetected_vector_mode = loop_vinfo->vector_mode; + if (dump_enabled_p ()) + { + if (res) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Analysis succeeded with vector mode %s\n", + GET_MODE_NAME (loop_vinfo->vector_mode)); + else + dump_printf_loc (MSG_NOTE, vect_location, + "***** Analysis failed with vector mode %s\n", + GET_MODE_NAME (loop_vinfo->vector_mode)); + } + + loop->aux = NULL; + + if (!fatal) + while (mode_i < vector_modes.length () + && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i])) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** The result for vector mode %s would" + " be the same\n", + GET_MODE_NAME (vector_modes[mode_i])); + mode_i += 1; + } - opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); if (res) { LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; + vectorized_loops++; - return loop_vinfo; - } - - delete loop_vinfo; + /* Once we hit the desired simdlen for the first time, + discard any previous attempts. */ + if (simdlen + && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen)) + { + delete first_loop_vinfo; + first_loop_vinfo = opt_loop_vec_info::success (NULL); + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; + simdlen = 0; + } + else if (pick_lowest_cost_p && first_loop_vinfo) + { + /* Keep trying to roll back vectorization attempts while the + loop_vec_infos they produced were worse than this one. */ + vec &vinfos = first_loop_vinfo->epilogue_vinfos; + while (!vinfos.is_empty () + && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) + { + gcc_assert (vect_epilogues); + delete vinfos.pop (); + } + if (vinfos.is_empty () + && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) + { + delete first_loop_vinfo; + first_loop_vinfo = opt_loop_vec_info::success (NULL); + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; + } + } - if (next_size == 0) - autodetected_vector_size = current_vector_size; + if (first_loop_vinfo == NULL) + { + first_loop_vinfo = loop_vinfo; + lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); + } + else if (vect_epilogues + /* For now only allow one epilogue loop. */ + && first_loop_vinfo->epilogue_vinfos.is_empty ()) + { + first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); + poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); + gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) + || maybe_ne (lowest_th, 0U)); + /* Keep track of the known smallest versioning + threshold. */ + if (ordered_p (lowest_th, th)) + lowest_th = ordered_min (lowest_th, th); + } + else + delete loop_vinfo; + + /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is + enabled, SIMDUID is not set, it is the innermost loop and we have + either already found the loop's SIMDLEN or there was no SIMDLEN to + begin with. + TODO: Enable epilogue vectorization for loops with SIMDUID set. */ + vect_epilogues = (!simdlen + && loop->inner == NULL + && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK) + && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) + && !loop->simduid + /* For now only allow one epilogue loop, but allow + pick_lowest_cost_p to replace it. */ + && (first_loop_vinfo->epilogue_vinfos.is_empty () + || pick_lowest_cost_p)); + + /* Commit to first_loop_vinfo if we have no reason to try + alternatives. 
*/ + if (!simdlen && !vect_epilogues && !pick_lowest_cost_p) + break; + } + else + { + delete loop_vinfo; + if (fatal) + { + gcc_checking_assert (first_loop_vinfo == NULL); + break; + } + } - if (next_size < vector_sizes.length () - && known_eq (vector_sizes[next_size], autodetected_vector_size)) - next_size += 1; + if (mode_i < vector_modes.length () + && VECTOR_MODE_P (autodetected_vector_mode) + && (related_vector_mode (vector_modes[mode_i], + GET_MODE_INNER (autodetected_vector_mode)) + == autodetected_vector_mode) + && (related_vector_mode (autodetected_vector_mode, + GET_MODE_INNER (vector_modes[mode_i])) + == vector_modes[mode_i])) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Skipping vector mode %s, which would" + " repeat the analysis for %s\n", + GET_MODE_NAME (vector_modes[mode_i]), + GET_MODE_NAME (autodetected_vector_mode)); + mode_i += 1; + } - if (fatal - || next_size == vector_sizes.length () - || known_eq (current_vector_size, 0U)) - return opt_loop_vec_info::propagate_failure (res); + if (mode_i == vector_modes.length () + || autodetected_vector_mode == VOIDmode) + break; /* Try the next biggest vector size. */ - current_vector_size = vector_sizes[next_size++]; + next_vector_mode = vector_modes[mode_i++]; if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "***** Re-trying analysis with " - "vector size "); - dump_dec (MSG_NOTE, current_vector_size); - dump_printf (MSG_NOTE, "\n"); - } + dump_printf_loc (MSG_NOTE, vect_location, + "***** Re-trying analysis with vector mode %s\n", + GET_MODE_NAME (next_vector_mode)); + } + + if (first_loop_vinfo) + { + loop->aux = (loop_vec_info) first_loop_vinfo; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Choosing vector mode %s\n", + GET_MODE_NAME (first_loop_vinfo->vector_mode)); + LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th; + return first_loop_vinfo; } + + return opt_loop_vec_info::propagate_failure (res); } /* Return true if there is an in-order reduction function for CODE, storing @@ -2397,17 +2755,17 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) /* If there is a neutral value X such that SLP reduction NODE would not be affected by the introduction of additional X elements, return that X, - otherwise return null. CODE is the code of the reduction. REDUC_CHAIN - is true if the SLP statements perform a single reduction, false if each - statement performs an independent reduction. */ + otherwise return null. CODE is the code of the reduction and VECTOR_TYPE + is the vector type that would hold element X. REDUC_CHAIN is true if + the SLP statements perform a single reduction, false if each statement + performs an independent reduction. */ static tree -neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, - bool reduc_chain) +neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, + tree_code code, bool reduc_chain) { vec stmts = SLP_TREE_SCALAR_STMTS (slp_node); stmt_vec_info stmt_vinfo = stmts[0]; - tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); tree scalar_type = TREE_TYPE (vector_type); struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; gcc_assert (loop); @@ -2453,241 +2811,55 @@ report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg) dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt); } -/* DEF_STMT_INFO occurs in a loop that contains a potential reduction - operation. 
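Stripped of the vectorizer data structures, the rewritten vect_analyze_loop driver above boils down to: walk the candidate vector modes in order, keep the first success (or, with VECT_COMPARE_COSTS, the cheapest one), and optionally retain a later success as a vectorized epilogue. A toy version of that control flow, with a stand-in analysis function, purely to show the shape of the loop:

#include <stddef.h>

struct analysis { int ok; unsigned inside_cost; };

/* Stand-in for vect_analyze_loop_2: pretend only the two narrowest
   candidate modes succeed and that narrower means cheaper here.  */
static struct analysis
try_mode (int mode)
{
  struct analysis a;
  a.ok = mode <= 2;
  a.inside_cost = 10u + (unsigned) mode;
  return a;
}

static int
choose_mode (const int *modes, size_t n, int compare_costs)
{
  int best = -1;
  unsigned best_cost = 0;
  for (size_t i = 0; i < n; i++)
    {
      struct analysis a = try_mode (modes[i]);
      if (!a.ok)
        continue;
      if (best < 0 || (compare_costs && a.inside_cost < best_cost))
        {
          best = modes[i];
          best_cost = a.inside_cost;
        }
      if (!compare_costs)
        break;                  /* commit to the first success */
    }
  return best;                  /* -1: no mode worked */
}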
Return true if the results of DEF_STMT_INFO are something - that can be accumulated by such a reduction. */ +/* Return true if we need an in-order reduction for operation CODE + on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer + overflow must wrap. */ -static bool -vect_valid_reduction_input_p (stmt_vec_info def_stmt_info) +bool +needs_fold_left_reduction_p (tree type, tree_code code) { - return (is_gimple_assign (def_stmt_info->stmt) - || is_gimple_call (def_stmt_info->stmt) - || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def - || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI - && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def - && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt)))); -} + /* CHECKME: check for !flag_finite_math_only too? */ + if (SCALAR_FLOAT_TYPE_P (type)) + switch (code) + { + case MIN_EXPR: + case MAX_EXPR: + return false; -/* Detect SLP reduction of the form: + default: + return !flag_associative_math; + } - #a1 = phi - a2 = operation (a1) - a3 = operation (a2) - a4 = operation (a3) - a5 = operation (a4) + if (INTEGRAL_TYPE_P (type)) + { + if (!operation_no_trapping_overflow (type, code)) + return true; + return false; + } - #a = phi + if (SAT_FIXED_POINT_TYPE_P (type)) + return true; - PHI is the reduction phi node (#a1 = phi above) - FIRST_STMT is the first reduction stmt in the chain - (a2 = operation (a1)). + return false; +} - Return TRUE if a reduction chain was detected. */ +/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and + has a handled computation expression. Store the main reduction + operation in *CODE. */ static bool -vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, - gimple *first_stmt) +check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, + tree loop_arg, enum tree_code *code, + vec > &path) { - struct loop *loop = (gimple_bb (phi))->loop_father; - struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); - enum tree_code code; - gimple *loop_use_stmt = NULL; - stmt_vec_info use_stmt_info; - tree lhs; - imm_use_iterator imm_iter; - use_operand_p use_p; - int nloop_uses, size = 0, n_out_of_loop_uses; - bool found = false; - - if (loop != vect_loop) - return false; - - auto_vec reduc_chain; - lhs = PHI_RESULT (phi); - code = gimple_assign_rhs_code (first_stmt); - while (1) - { - nloop_uses = 0; - n_out_of_loop_uses = 0; - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) - { - gimple *use_stmt = USE_STMT (use_p); - if (is_gimple_debug (use_stmt)) - continue; - - /* Check if we got back to the reduction phi. */ - if (use_stmt == phi) - { - loop_use_stmt = use_stmt; - found = true; - break; - } - - if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) - { - loop_use_stmt = use_stmt; - nloop_uses++; - } - else - n_out_of_loop_uses++; - - /* There are can be either a single use in the loop or two uses in - phi nodes. */ - if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses)) - return false; - } - - if (found) - break; - - /* We reached a statement with no loop uses. */ - if (nloop_uses == 0) - return false; - - /* This is a loop exit phi, and we haven't reached the reduction phi. */ - if (gimple_code (loop_use_stmt) == GIMPLE_PHI) - return false; - - if (!is_gimple_assign (loop_use_stmt) - || code != gimple_assign_rhs_code (loop_use_stmt) - || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt))) - return false; - - /* Insert USE_STMT into reduction chain. 
*/ - use_stmt_info = loop_info->lookup_stmt (loop_use_stmt); - reduc_chain.safe_push (use_stmt_info); - - lhs = gimple_assign_lhs (loop_use_stmt); - size++; - } - - if (!found || loop_use_stmt != phi || size < 2) - return false; - - /* Swap the operands, if needed, to make the reduction operand be the second - operand. */ - lhs = PHI_RESULT (phi); - for (unsigned i = 0; i < reduc_chain.length (); ++i) - { - gassign *next_stmt = as_a (reduc_chain[i]->stmt); - if (gimple_assign_rhs2 (next_stmt) == lhs) - { - tree op = gimple_assign_rhs1 (next_stmt); - stmt_vec_info def_stmt_info = loop_info->lookup_def (op); - - /* Check that the other def is either defined in the loop - ("vect_internal_def"), or it's an induction (defined by a - loop-header phi-node). */ - if (def_stmt_info - && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) - && vect_valid_reduction_input_p (def_stmt_info)) - { - lhs = gimple_assign_lhs (next_stmt); - continue; - } - - return false; - } - else - { - tree op = gimple_assign_rhs2 (next_stmt); - stmt_vec_info def_stmt_info = loop_info->lookup_def (op); - - /* Check that the other def is either defined in the loop - ("vect_internal_def"), or it's an induction (defined by a - loop-header phi-node). */ - if (def_stmt_info - && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)) - && vect_valid_reduction_input_p (def_stmt_info)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G", - next_stmt); - - swap_ssa_operands (next_stmt, - gimple_assign_rhs1_ptr (next_stmt), - gimple_assign_rhs2_ptr (next_stmt)); - update_stmt (next_stmt); - - if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt))) - LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; - } - else - return false; - } - - lhs = gimple_assign_lhs (next_stmt); - } - - /* Build up the actual chain. */ - for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) - { - REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; - REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; - } - REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; - REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; - - /* Save the chain for further analysis in SLP detection. */ - LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); - REDUC_GROUP_SIZE (reduc_chain[0]) = size; - - return true; -} - -/* Return true if we need an in-order reduction for operation CODE - on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer - overflow must wrap. */ - -static bool -needs_fold_left_reduction_p (tree type, tree_code code, - bool need_wrapping_integral_overflow) -{ - /* CHECKME: check for !flag_finite_math_only too? */ - if (SCALAR_FLOAT_TYPE_P (type)) - switch (code) - { - case MIN_EXPR: - case MAX_EXPR: - return false; - - default: - return !flag_associative_math; - } - - if (INTEGRAL_TYPE_P (type)) - { - if (!operation_no_trapping_overflow (type, code)) - return true; - if (need_wrapping_integral_overflow - && !TYPE_OVERFLOW_WRAPS (type) - && operation_can_overflow (code)) - return true; - return false; - } - - if (SAT_FIXED_POINT_TYPE_P (type)) - return true; - - return false; -} - -/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and - reduction operation CODE has a handled computation expression. 
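needs_fold_left_reduction_p (kept above, minus the wrapping-overflow parameter) exists because reassociating a floating-point reduction changes the result. A small demonstration of why the in-order scheme is the default without -fassociative-math; a two-lane vector reduction effectively computes the strided partial sums first:

#include <stdio.h>

int
main (void)
{
  float a[4] = { 1.0e8f, 1.0f, -1.0e8f, 1.0f };
  /* In-order (fold-left) evaluation, as the scalar loop performs it.  */
  float in_order = ((a[0] + a[1]) + a[2]) + a[3];
  /* Reassociated evaluation: per-lane partial sums, then a final add,
     as a 2-lane vector reduction would perform it.  */
  float reassoc = (a[0] + a[2]) + (a[1] + a[3]);
  printf ("in-order: %g  reassociated: %g\n", in_order, reassoc);
  return 0;                     /* typically prints 1 and 2 */
}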
*/ - -bool -check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, - tree loop_arg, enum tree_code code) -{ - auto_vec > path; - auto_bitmap visited; - tree lookfor = PHI_RESULT (phi); - ssa_op_iter curri; - use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); - while (USE_FROM_PTR (curr) != loop_arg) - curr = op_iter_next_use (&curri); - curri.i = curri.numops; - do + auto_bitmap visited; + tree lookfor = PHI_RESULT (phi); + ssa_op_iter curri; + use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE); + while (USE_FROM_PTR (curr) != loop_arg) + curr = op_iter_next_use (&curri); + curri.i = curri.numops; + do { path.safe_push (std::make_pair (curri, curr)); tree use = USE_FROM_PTR (curr); @@ -2747,36 +2919,71 @@ pop: /* Check whether the reduction path detected is valid. */ bool fail = path.length () == 0; bool neg = false; + *code = ERROR_MARK; for (unsigned i = 1; i < path.length (); ++i) { gimple *use_stmt = USE_STMT (path[i].second); tree op = USE_FROM_PTR (path[i].second); - if (! has_single_use (op) - || ! is_gimple_assign (use_stmt)) + if (! is_gimple_assign (use_stmt) + /* The following make sure we can compute the operand index + easily plus it mostly disallows chaining via COND_EXPR condition + operands. */ + || (gimple_assign_rhs1 (use_stmt) != op + && gimple_assign_rhs2 (use_stmt) != op + && gimple_assign_rhs3 (use_stmt) != op)) { fail = true; break; } - if (gimple_assign_rhs_code (use_stmt) != code) + /* Check there's only a single stmt the op is used on inside + of the loop. */ + imm_use_iterator imm_iter; + gimple *op_use_stmt; + unsigned cnt = 0; + FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) + if (!is_gimple_debug (op_use_stmt) + && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) + cnt++; + if (cnt != 1) { - if (code == PLUS_EXPR - && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) - { - /* Track whether we negate the reduction value each iteration. */ - if (gimple_assign_rhs2 (use_stmt) == op) - neg = ! neg; - } - else - { - fail = true; - break; - } + fail = true; + break; + } + tree_code use_code = gimple_assign_rhs_code (use_stmt); + if (use_code == MINUS_EXPR) + { + use_code = PLUS_EXPR; + /* Track whether we negate the reduction value each iteration. */ + if (gimple_assign_rhs2 (use_stmt) == op) + neg = ! neg; + } + if (CONVERT_EXPR_CODE_P (use_code) + && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)), + TREE_TYPE (gimple_assign_rhs1 (use_stmt)))) + ; + else if (*code == ERROR_MARK) + *code = use_code; + else if (use_code != *code) + { + fail = true; + break; } } - return ! fail && ! neg; + return ! fail && ! 
neg && *code != ERROR_MARK; +} + +bool +check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi, + tree loop_arg, enum tree_code code) +{ + auto_vec > path; + enum tree_code code_; + return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path) + && code_ == code); } + /* Function vect_is_simple_reduction (1) Detect a cross-iteration def-use cycle that represents a simple @@ -2823,25 +3030,15 @@ pop: static stmt_vec_info vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, - bool *double_reduc, - bool need_wrapping_integral_overflow, - enum vect_reduction_type *v_reduc_type) + bool *double_reduc) { gphi *phi = as_a (phi_info->stmt); - struct loop *loop = (gimple_bb (phi))->loop_father; - struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); - bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); gimple *phi_use_stmt = NULL; - enum tree_code orig_code, code; - tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; - tree type; - tree name; imm_use_iterator imm_iter; use_operand_p use_p; - bool phi_def; *double_reduc = false; - *v_reduc_type = TREE_CODE_REDUCTION; + STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; tree phi_name = PHI_RESULT (phi); /* ??? If there are no uses of the PHI result the inner loop reduction @@ -2850,6 +3047,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, can be constant. See PR60382. */ if (has_zero_uses (phi_name)) return NULL; + class loop *loop = (gimple_bb (phi))->loop_father; unsigned nphi_def_loop_uses = 0; FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) { @@ -2870,44 +3068,26 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, phi_use_stmt = use_stmt; } - edge latch_e = loop_latch_edge (loop); - tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); - if (TREE_CODE (loop_arg) != SSA_NAME) + tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); + if (TREE_CODE (latch_def) != SSA_NAME) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "reduction: not ssa_name: %T\n", loop_arg); + "reduction: not ssa_name: %T\n", latch_def); return NULL; } - stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg); + stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def); if (!def_stmt_info || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) return NULL; - if (gassign *def_stmt = dyn_cast (def_stmt_info->stmt)) - { - name = gimple_assign_lhs (def_stmt); - phi_def = false; - } - else if (gphi *def_stmt = dyn_cast (def_stmt_info->stmt)) - { - name = PHI_RESULT (def_stmt); - phi_def = true; - } - else - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "reduction: unhandled reduction operation: %G", - def_stmt_info->stmt); - return NULL; - } - + bool nested_in_vect_loop + = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop); unsigned nlatch_def_loop_uses = 0; auto_vec lcphis; bool inner_loop_of_double_reduc = false; - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def) { gimple *use_stmt = USE_STMT (use_p); if (is_gimple_debug (use_stmt)) @@ -2925,11 +3105,21 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, } } + /* If we are vectorizing an inner reduction we are executing that + in the original order only in case we are not dealing with a + double reduction. 
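At the source level, the path check above accepts a loop whose accumulator is updated by a single operation code along the whole chain (MINUS is folded into PLUS of a negation, and no-op conversions are allowed) and rejects mixed-operation recurrences. Two illustrative cases, hypothetical functions not taken from the patch:

/* Accepted: one operation code on the whole path; the subtraction is
   treated as adding the negated value.  */
int
sub_reduction (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s = s - a[i];
  return s;
}

/* Not a simple reduction path: the accumulator flows through a multiply
   and then an add, so no single tree code describes the chain.  */
int
mixed_recurrence (const int *a, int n)
{
  int s = 1;
  for (int i = 0; i < n; i++)
    s = s * a[i] + 1;
  return s;
}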
*/ + if (nested_in_vect_loop && !inner_loop_of_double_reduc) + { + if (dump_enabled_p ()) + report_vect_op (MSG_NOTE, def_stmt_info->stmt, + "detected nested cycle: "); + return def_stmt_info; + } + /* If this isn't a nested cycle or if the nested cycle reduction value is used ouside of the inner loop we cannot handle uses of the reduction value. */ - if ((!nested_in_vect_loop || inner_loop_of_double_reduc) - && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)) + if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -2939,11 +3129,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, /* If DEF_STMT is a phi node itself, we expect it to have a single argument defined in the inner loop. */ - if (phi_def) + if (gphi *def_stmt = dyn_cast (def_stmt_info->stmt)) { - gphi *def_stmt = as_a (def_stmt_info->stmt); - op1 = PHI_ARG_DEF (def_stmt, 0); - + tree op1 = PHI_ARG_DEF (def_stmt, 0); if (gimple_phi_num_args (def_stmt) != 1 || TREE_CODE (op1) != SSA_NAME) { @@ -2974,290 +3162,74 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, return NULL; } - /* If we are vectorizing an inner reduction we are executing that - in the original order only in case we are not dealing with a - double reduction. */ - bool check_reduction = true; - if (flow_loop_nested_p (vect_loop, loop)) - { - gphi *lcphi; - unsigned i; - check_reduction = false; - FOR_EACH_VEC_ELT (lcphis, i, lcphi) - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi)) - { - gimple *use_stmt = USE_STMT (use_p); - if (is_gimple_debug (use_stmt)) - continue; - if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt))) - check_reduction = true; - } - } - - gassign *def_stmt = as_a (def_stmt_info->stmt); - code = orig_code = gimple_assign_rhs_code (def_stmt); - - if (nested_in_vect_loop && !check_reduction) - { - /* FIXME: Even for non-reductions code generation is funneled - through vectorizable_reduction for the stmt defining the - PHI latch value. So we have to artificially restrict ourselves - for the supported operations. */ - switch (get_gimple_rhs_class (code)) - { - case GIMPLE_BINARY_RHS: - case GIMPLE_TERNARY_RHS: - break; - default: - /* Not supported by vectorizable_reduction. */ - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "nested cycle: not handled operation: "); - return NULL; - } - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: "); - return def_stmt_info; - } - - /* We can handle "res -= x[i]", which is non-associative by - simply rewriting this into "res += -x[i]". Avoid changing - gimple instruction for the first simple tests and only do this - if we're allowed to change code at all. */ - if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name) - code = PLUS_EXPR; - - if (code == COND_EXPR) + /* Look for the expression computing latch_def from then loop PHI result. */ + auto_vec > path; + enum tree_code code; + if (check_reduction_path (vect_location, loop, phi, latch_def, &code, + path)) { - if (! 
nested_in_vect_loop) - *v_reduc_type = COND_REDUCTION; + STMT_VINFO_REDUC_CODE (phi_info) = code; + if (code == COND_EXPR && !nested_in_vect_loop) + STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION; - op3 = gimple_assign_rhs1 (def_stmt); - if (COMPARISON_CLASS_P (op3)) - { - op4 = TREE_OPERAND (op3, 1); - op3 = TREE_OPERAND (op3, 0); - } - if (op3 == phi_name || op4 == phi_name) + /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP + reduction chain for which the additional restriction is that + all operations in the chain are the same. */ + auto_vec reduc_chain; + unsigned i; + bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR; + for (i = path.length () - 1; i >= 1; --i) { - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: condition depends on previous" - " iteration: "); - return NULL; + gimple *stmt = USE_STMT (path[i].second); + stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); + STMT_VINFO_REDUC_IDX (stmt_info) + = path[i].second->use - gimple_assign_rhs1_ptr (stmt); + enum tree_code stmt_code = gimple_assign_rhs_code (stmt); + bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code) + && (i == 1 || i == path.length () - 1)); + if ((stmt_code != code && !leading_conversion) + /* We can only handle the final value in epilogue + generation for reduction chains. */ + || (i != 1 && !has_single_use (gimple_assign_lhs (stmt)))) + is_slp_reduc = false; + /* For reduction chains we support a trailing/leading + conversions. We do not store those in the actual chain. */ + if (leading_conversion) + continue; + reduc_chain.safe_push (stmt_info); } - - op1 = gimple_assign_rhs2 (def_stmt); - op2 = gimple_assign_rhs3 (def_stmt); - } - else if (!commutative_tree_code (code) || !associative_tree_code (code)) - { - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: not commutative/associative: "); - return NULL; - } - else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS) - { - op1 = gimple_assign_rhs1 (def_stmt); - op2 = gimple_assign_rhs2 (def_stmt); - } - else - { - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: not handled operation: "); - return NULL; - } - - if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) - { - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: both uses not ssa_names: "); - - return NULL; - } - - type = TREE_TYPE (gimple_assign_lhs (def_stmt)); - if ((TREE_CODE (op1) == SSA_NAME - && !types_compatible_p (type,TREE_TYPE (op1))) - || (TREE_CODE (op2) == SSA_NAME - && !types_compatible_p (type, TREE_TYPE (op2))) - || (op3 && TREE_CODE (op3) == SSA_NAME - && !types_compatible_p (type, TREE_TYPE (op3))) - || (op4 && TREE_CODE (op4) == SSA_NAME - && !types_compatible_p (type, TREE_TYPE (op4)))) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "reduction: multiple types: operation type: " - "%T, operands types: %T,%T", - type, TREE_TYPE (op1), TREE_TYPE (op2)); - if (op3) - dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3)); - - if (op4) - dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4)); - dump_printf (MSG_NOTE, "\n"); - } - - return NULL; - } - - /* Check whether it's ok to change the order of the computation. - Generally, when vectorizing a reduction we change the order of the - computation. This may change the behavior of the program in some - cases, so we need to check that this is ok. 
One exception is when - vectorizing an outer-loop: the inner-loop is executed sequentially, - and therefore vectorizing reductions in the inner-loop during - outer-loop vectorization is safe. */ - if (check_reduction - && *v_reduc_type == TREE_CODE_REDUCTION - && needs_fold_left_reduction_p (type, code, - need_wrapping_integral_overflow)) - *v_reduc_type = FOLD_LEFT_REDUCTION; - - /* Reduction is safe. We're dealing with one of the following: - 1) integer arithmetic and no trapv - 2) floating point arithmetic, and special flags permit this optimization - 3) nested cycle (i.e., outer loop vectorization). */ - stmt_vec_info def1_info = loop_info->lookup_def (op1); - stmt_vec_info def2_info = loop_info->lookup_def (op2); - if (code != COND_EXPR && !def1_info && !def2_info) - { - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: "); - return NULL; - } - - /* Check that one def is the reduction def, defined by PHI, - the other def is either defined in the loop ("vect_internal_def"), - or it's an induction (defined by a loop-header phi-node). */ - - if (def2_info - && def2_info->stmt == phi - && (code == COND_EXPR - || !def1_info - || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt)) - || vect_valid_reduction_input_p (def1_info))) - { - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); - return def_stmt_info; - } - - if (def1_info - && def1_info->stmt == phi - && (code == COND_EXPR - || !def2_info - || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt)) - || vect_valid_reduction_input_p (def2_info))) - { - if (! nested_in_vect_loop && orig_code != MINUS_EXPR) + if (is_slp_reduc && reduc_chain.length () > 1) { - /* Check if we can swap operands (just for simplicity - so that - the rest of the code can assume that the reduction variable - is always the last (second) argument). */ - if (code == COND_EXPR) + for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) { - /* Swap cond_expr by inverting the condition. */ - tree cond_expr = gimple_assign_rhs1 (def_stmt); - enum tree_code invert_code = ERROR_MARK; - enum tree_code cond_code = TREE_CODE (cond_expr); - - if (TREE_CODE_CLASS (cond_code) == tcc_comparison) - { - bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0)); - invert_code = invert_tree_comparison (cond_code, honor_nans); - } - if (invert_code != ERROR_MARK) - { - TREE_SET_CODE (cond_expr, invert_code); - swap_ssa_operands (def_stmt, - gimple_assign_rhs2_ptr (def_stmt), - gimple_assign_rhs3_ptr (def_stmt)); - } - else - { - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, - "detected reduction: cannot swap operands " - "for cond_expr"); - return NULL; - } + REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; + REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; } - else - swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), - gimple_assign_rhs2_ptr (def_stmt)); - - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, - "detected reduction: need to swap operands: "); - - if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt))) - LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true; - } - else - { - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, "detected reduction: "); - } + REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; + REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; - return def_stmt_info; - } + /* Save the chain for further analysis in SLP detection. 
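The chain gathering above replaces the old vect_is_slp_reduction walk: every statement on the path gets its STMT_VINFO_REDUC_IDX recorded, and when all of them use the same operation the group is kept as a reduction chain for SLP. In source form a reduction chain looks like this (an illustrative example, not code from the patch):

/* One accumulator updated several times per iteration with the same
   operation: the three additions form a reduction chain of size 3.  */
int
reduction_chain (const int *a, const int *b, const int *c, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      s += a[i];
      s += b[i];
      s += c[i];
    }
  return s;
}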
*/ + LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); + REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); - /* Try to find SLP reduction chain. */ - if (! nested_in_vect_loop - && code != COND_EXPR - && orig_code != MINUS_EXPR - && vect_is_slp_reduction (loop_info, phi, def_stmt)) - { - if (dump_enabled_p ()) - report_vect_op (MSG_NOTE, def_stmt, - "reduction: detected reduction chain: "); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "reduction: detected reduction chain\n"); + } + else if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "reduction: detected reduction\n"); return def_stmt_info; } - /* Look for the expression computing loop_arg from loop PHI result. */ - if (check_reduction_path (vect_location, loop, phi, loop_arg, code)) - return def_stmt_info; - if (dump_enabled_p ()) - { - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: unknown pattern: "); - } + dump_printf_loc (MSG_NOTE, vect_location, + "reduction: unknown pattern\n"); return NULL; } -/* Wrapper around vect_is_simple_reduction, which will modify code - in-place if it enables detection of more reductions. Arguments - as there. */ - -stmt_vec_info -vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, - bool *double_reduc, - bool need_wrapping_integral_overflow) -{ - enum vect_reduction_type v_reduc_type; - stmt_vec_info def_info - = vect_is_simple_reduction (loop_info, phi_info, double_reduc, - need_wrapping_integral_overflow, - &v_reduc_type); - if (def_info) - { - STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type; - STMT_VINFO_REDUC_DEF (phi_info) = def_info; - STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type; - STMT_VINFO_REDUC_DEF (def_info) = phi_info; - } - return def_info; -} - /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ int vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, @@ -3601,7 +3573,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, &vec_inside_cost, &vec_epilogue_cost); vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); - + + /* Stash the costs so that we can compare two loop_vec_infos. */ + loop_vinfo->vec_inside_cost = vec_inside_cost; + loop_vinfo->vec_outside_cost = vec_outside_cost; + if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); @@ -3846,6 +3822,7 @@ have_whole_vector_shift (machine_mode mode) static void vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, + vect_reduction_type reduction_type, int ncopies, stmt_vector_for_cost *cost_vec) { int prologue_cost = 0, epilogue_cost = 0, inside_cost; @@ -3860,8 +3837,6 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn, loop = LOOP_VINFO_LOOP (loop_vinfo); /* Condition reductions generate two reductions in the loop. */ - vect_reduction_type reduction_type - = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); if (reduction_type == COND_REDUCTION) ncopies *= 2; @@ -4080,15 +4055,15 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies, A cost model should help decide between these two schemes. 
*/ -tree -get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, +static tree +get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, + enum tree_code code, tree init_val, tree *adjustment_def) { loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree scalar_type = TREE_TYPE (init_val); - tree vectype = get_vectype_for_scalar_type (scalar_type); - enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt); + tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); tree def_for_init; tree init_def; REAL_VALUE_TYPE real_init_val = dconst0; @@ -4103,8 +4078,10 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); - vect_reduction_type reduction_type - = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); + /* ADJUSTMENT_DEF is NULL when called from + vect_create_epilog_for_reduction to vectorize double reduction. */ + if (adjustment_def) + *adjustment_def = NULL; switch (code) { @@ -4118,11 +4095,6 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, case MULT_EXPR: case BIT_AND_EXPR: { - /* ADJUSTMENT_DEF is NULL when called from - vect_create_epilog_for_reduction to vectorize double reduction. */ - if (adjustment_def) - *adjustment_def = init_val; - if (code == MULT_EXPR) { real_init_val = dconst1; @@ -4137,10 +4109,14 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, else def_for_init = build_int_cst (scalar_type, int_init_val); - if (adjustment_def) - /* Option1: the first element is '0' or '1' as well. */ - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); + if (adjustment_def || operand_equal_p (def_for_init, init_val, 0)) + { + /* Option1: the first element is '0' or '1' as well. */ + if (!operand_equal_p (def_for_init, init_val, 0)) + *adjustment_def = init_val; + init_def = gimple_build_vector_from_val (&stmts, vectype, + def_for_init); + } else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) { /* Option2 (variable length): the first element is INIT_VAL. */ @@ -4164,16 +4140,6 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val, case MAX_EXPR: case COND_EXPR: { - if (adjustment_def) - { - *adjustment_def = NULL_TREE; - if (reduction_type != COND_REDUCTION - && reduction_type != EXTRACT_LAST_REDUCTION) - { - init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo); - break; - } - } init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); } @@ -4201,6 +4167,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, { vec stmts = SLP_TREE_SCALAR_STMTS (slp_node); stmt_vec_info stmt_vinfo = stmts[0]; + vec_info *vinfo = stmt_vinfo->vinfo; unsigned HOST_WIDE_INT nunits; unsigned j, number_of_places_left_in_vector; tree vector_type; @@ -4293,7 +4260,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, { /* First time round, duplicate ELTS to fill the required number of vectors. */ - duplicate_and_interleave (&ctor_seq, vector_type, elts, + duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, number_of_vectors, *vec_oprnds); break; } @@ -4309,42 +4276,47 @@ get_initial_defs_for_reduction (slp_tree slp_node, gsi_insert_seq_on_edge_immediate (pe, ctor_seq); } +/* For a statement STMT_INFO taking part in a reduction operation return + the stmt_vec_info the meta information is stored on. 
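The reworked get_initial_def_for_reduction above starts the vector accumulator at the operation's neutral element when one exists and records the real initial value as an epilogue adjustment, while MIN/MAX/COND reductions broadcast the initial value itself. A scalar-level sketch of that policy (illustrative names and an int element type, not the GCC interface):

enum red_op { RED_PLUS, RED_MULT, RED_MIN, RED_MAX };

/* Return the value the accumulator starts from; *NEEDS_ADJUSTMENT says
   whether INIT_VAL still has to be folded in after the loop.  */
static int
initial_accumulator (enum red_op op, int init_val, int *needs_adjustment)
{
  switch (op)
    {
    case RED_PLUS:
      *needs_adjustment = init_val != 0;
      return 0;                 /* neutral element of +  */
    case RED_MULT:
      *needs_adjustment = init_val != 1;
      return 1;                 /* neutral element of *  */
    default:
      *needs_adjustment = 0;    /* MIN/MAX: start from INIT_VAL itself */
      return init_val;
    }
}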
*/ -/* Function vect_create_epilog_for_reduction - - Create code at the loop-epilog to finalize the result of a reduction +stmt_vec_info +info_for_reduction (stmt_vec_info stmt_info) +{ + stmt_info = vect_orig_stmt (stmt_info); + gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info)); + if (!is_a (stmt_info->stmt)) + stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); + gphi *phi = as_a (stmt_info->stmt); + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) + { + if (gimple_phi_num_args (phi) == 1) + stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); + } + else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) + { + edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father); + stmt_vec_info info + = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe)); + if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) + stmt_info = info; + } + return stmt_info; +} + +/* Function vect_create_epilog_for_reduction + + Create code at the loop-epilog to finalize the result of a reduction computation. - VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector - reduction statements. STMT_INFO is the scalar reduction stmt that is being vectorized. - NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the - number of elements that we can fit in a vectype (nunits). In this case - we have to generate more than one vector stmt - i.e - we need to "unroll" - the vector stmt by a factor VF/nunits. For more details see documentation - in vectorizable_operation. - REDUC_FN is the internal function for the epilog reduction. - REDUCTION_PHIS is a list of the phi-nodes that carry the reduction - computation. - REDUC_INDEX is the index of the operand in the right hand side of the - statement that is defined by REDUCTION_PHI. - DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. SLP_NODE is an SLP node containing a group of reduction statements. The first one in this group is STMT_INFO. - INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case - when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to - be smaller than any value of the IV in the loop, for MIN_EXPR larger than - any value of the IV in the loop. - INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION. - NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is - null if this is not an SLP reduction + SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE + REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi + (counting from 0) This function: - 1. Creates the reduction def-use cycles: sets the arguments for - REDUCTION_PHIS: - The loop-entry argument is the vectorized initial-value of the reduction. - The loop-latch argument is taken from VECT_DEFS - the vector of partial - sums. + 1. Completes the reduction def-use cycles. 2. "Reduces" each vector of partial results VECT_DEFS into a single result, by calling the function specified by REDUC_FN if available, or by other means (whole-vector shifts or a scalar loop). 
@@ -4354,7 +4326,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, The flow at the entry to this function: loop: - vec_def = phi # REDUCTION_PHI + vec_def = phi # REDUCTION_PHI VECT_DEF = vector_stmt # vectorized form of STMT_INFO s_loop = scalar_stmt # (scalar) STMT_INFO loop_exit: @@ -4379,21 +4351,34 @@ get_initial_defs_for_reduction (slp_tree slp_node, */ static void -vect_create_epilog_for_reduction (vec vect_defs, - stmt_vec_info stmt_info, - gimple *reduc_def_stmt, - int ncopies, internal_fn reduc_fn, - vec reduction_phis, - bool double_reduc, +vect_create_epilog_for_reduction (stmt_vec_info stmt_info, slp_tree slp_node, - slp_instance slp_node_instance, - tree induc_val, enum tree_code induc_code, - tree neutral_op) + slp_instance slp_node_instance) { + stmt_vec_info reduc_info = info_for_reduction (stmt_info); + gcc_assert (reduc_info->is_reduc_info); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + /* For double reductions we need to get at the inner loop reduction + stmt which has the meta info attached. Our stmt_info is that of the + loop-closed PHI of the inner loop which we remember as + def for the reduction PHI generation. */ + bool double_reduc = false; + stmt_vec_info rdef_info = stmt_info; + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) + { + gcc_assert (!slp_node); + double_reduc = true; + stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def + (stmt_info->stmt, 0)); + stmt_info = vect_stmt_to_vectorize (stmt_info); + } + gphi *reduc_def_stmt + = as_a (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt); + enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); stmt_vec_info prev_phi_info; tree vectype; machine_mode mode; - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; basic_block exit_bb; tree scalar_dest; @@ -4401,32 +4386,24 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple *new_phi = NULL, *phi; stmt_vec_info phi_info; gimple_stmt_iterator exit_gsi; - tree vec_dest; - tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; + tree new_temp = NULL_TREE, new_name, new_scalar_dest; gimple *epilog_stmt = NULL; - enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt); gimple *exit_phi; tree bitsize; - tree adjustment_def = NULL; - tree vec_initial_def = NULL; - tree expr, def, initial_def = NULL; + tree def; tree orig_name, scalar_result; imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; gimple *use_stmt; - stmt_vec_info reduction_phi_info = NULL; bool nested_in_vect_loop = false; auto_vec new_phis; - auto_vec inner_phis; int j, i; auto_vec scalar_results; - unsigned int group_size = 1, k, ratio; - auto_vec vec_initial_defs; + unsigned int group_size = 1, k; auto_vec phis; bool slp_reduc = false; bool direct_slp_reduc; tree new_phi_result; - stmt_vec_info inner_phi = NULL; tree induction_index = NULL_TREE; if (slp_node) @@ -4439,127 +4416,53 @@ vect_create_epilog_for_reduction (vec vect_defs, nested_in_vect_loop = true; gcc_assert (!slp_node); } + gcc_assert (!nested_in_vect_loop || double_reduc); - vectype = STMT_VINFO_VECTYPE (stmt_info); + vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); gcc_assert (vectype); mode = TYPE_MODE (vectype); - /* 1. 
Create the reduction def-use cycle: - Set the arguments of REDUCTION_PHIS, i.e., transform - - loop: - vec_def = phi # REDUCTION_PHI - VECT_DEF = vector_stmt # vectorized form of STMT - ... - - into: - - loop: - vec_def = phi # REDUCTION_PHI - VECT_DEF = vector_stmt # vectorized form of STMT - ... - - (in case of SLP, do it for all the phis). */ - - /* Get the loop-entry arguments. */ - enum vect_def_type initial_def_dt = vect_unknown_def_type; + tree initial_def = NULL; + tree induc_val = NULL_TREE; + tree adjustment_def = NULL; if (slp_node) - { - unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - vec_initial_defs.reserve (vec_num); - get_initial_defs_for_reduction (slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - REDUC_GROUP_FIRST_ELEMENT (stmt_info), - neutral_op); - } + ; else { /* Get at the scalar def before the loop, that defines the initial value of the reduction variable. */ initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, loop_preheader_edge (loop)); - /* Optimize: if initial_def is for REDUC_MAX smaller than the base - and we can't use zero for induc_val, use initial_def. Similarly - for REDUC_MIN and initial_def larger than the base. */ - if (TREE_CODE (initial_def) == INTEGER_CST - && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == INTEGER_INDUC_COND_REDUCTION) - && !integer_zerop (induc_val) - && ((induc_code == MAX_EXPR - && tree_int_cst_lt (initial_def, induc_val)) - || (induc_code == MIN_EXPR - && tree_int_cst_lt (induc_val, initial_def)))) - induc_val = initial_def; - - if (double_reduc) - /* In case of double reduction we only create a vector variable - to be put in the reduction phi node. The actual statement - creation is done later in this function. */ - vec_initial_def = vect_create_destination_var (initial_def, vectype); + /* Optimize: for induction condition reduction, if we can't use zero + for induc_val, use initial_def. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) + induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); + else if (double_reduc) + ; else if (nested_in_vect_loop) - { - /* Do not use an adjustment def as that case is not supported - correctly if ncopies is not one. */ - vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt); - vec_initial_def = vect_get_vec_def_for_operand (initial_def, - stmt_info); - } + ; else - vec_initial_def - = get_initial_def_for_reduction (stmt_info, initial_def, - &adjustment_def); - vec_initial_defs.create (1); - vec_initial_defs.quick_push (vec_initial_def); + adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); } - /* Set phi nodes arguments. */ - FOR_EACH_VEC_ELT (reduction_phis, i, phi_info) + unsigned vec_num; + int ncopies; + if (slp_node) { - tree vec_init_def = vec_initial_defs[i]; - tree def = vect_defs[i]; - for (j = 0; j < ncopies; j++) - { - if (j != 0) - { - phi_info = STMT_VINFO_RELATED_STMT (phi_info); - if (nested_in_vect_loop) - vec_init_def - = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def); - } - - /* Set the loop-entry arg of the reduction-phi. */ - - gphi *phi = as_a (phi_info->stmt); - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == INTEGER_INDUC_COND_REDUCTION) - { - /* Initialise the reduction phi to zero. This prevents initial - values of non-zero interferring with the reduction op. 
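The induc_val handling being removed here (the new code recovers it from STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL further down) implements a sentinel scheme for integer-induction condition reductions: the reduction PHI is seeded with a value the induction variable can never reach, the loop tracks the matching index with MAX_EXPR, and the epilogue substitutes the original initial value only when the result still equals that sentinel, i.e. when no iteration matched. A scalar sketch of the idea follows; it is illustrative only, not GCC code, and assumes the IV counts up from 0.

#include <cstddef>

/* Condition reduction over an induction variable: return the last index i
   for which the predicate holds, or init_val when nothing matches.
   The sentinel plays the role of induc_val.  */
static long
last_matching_index (const int *a, size_t n, int threshold, long init_val)
{
  const long sentinel = -1;          /* below any index the IV can take */
  long best = sentinel;              /* reduction PHI seeded with induc_val */
  for (size_t i = 0; i < n; ++i)
    if (a[i] > threshold)
      best = (long) i > best ? (long) i : best;   /* MAX_EXPR reduction */
  return best == sentinel ? init_val : best;      /* epilogue fix-up */
}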
*/ - gcc_assert (ncopies == 1); - gcc_assert (i == 0); - - tree vec_init_def_type = TREE_TYPE (vec_init_def); - tree induc_val_vec - = build_vector_from_val (vec_init_def_type, induc_val); - - add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop), - UNKNOWN_LOCATION); - } - else - add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop), - UNKNOWN_LOCATION); - - /* Set the loop-latch arg for the reduction-phi. */ - if (j > 0) - def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def); - - add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "transform reduction: created def-use cycle: %G%G", - phi, SSA_NAME_DEF_STMT (def)); - } + vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length (); + ncopies = 1; + } + else + { + vec_num = 1; + ncopies = 0; + phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt)); + do + { + ncopies++; + phi_info = STMT_VINFO_RELATED_STMT (phi_info); + } + while (phi_info); } /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) @@ -4569,7 +4472,7 @@ vect_create_epilog_for_reduction (vec vect_defs, The first match will be a 1 to allow 0 to be used for non-matching indexes. If there are no matches at all then the vector will be all zeroes. */ - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) + if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) { tree indx_before_incr, indx_after_incr; poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); @@ -4627,11 +4530,17 @@ vect_create_epilog_for_reduction (vec vect_defs, tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); /* Create a conditional, where the condition is taken from vec_stmt - (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and - else is the phi (NEW_PHI_TREE). */ - tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, - ccompare, indx_before_incr, - new_phi_tree); + (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR: + the reduction phi corresponds to NEW_PHI_TREE and the new values + correspond to INDEX_BEFORE_INCR. */ + gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1); + tree index_cond_expr; + if (STMT_VINFO_REDUC_IDX (stmt_info) == 2) + index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, + ccompare, indx_before_incr, new_phi_tree); + else + index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, + ccompare, new_phi_tree, indx_before_incr); induction_index = make_ssa_name (cr_index_vector_type); gimple *index_condition = gimple_build_assign (induction_index, index_cond_expr); @@ -4674,12 +4583,17 @@ vect_create_epilog_for_reduction (vec vect_defs, /* 2.1 Create new loop-exit-phis to preserve loop-closed form: v_out1 = phi Store them in NEW_PHIS. */ - + if (double_reduc) + loop = outer_loop; exit_bb = single_exit (loop)->dest; prev_phi_info = NULL; - new_phis.create (vect_defs.length ()); - FOR_EACH_VEC_ELT (vect_defs, i, def) + new_phis.create (slp_node ? vec_num : ncopies); + for (unsigned i = 0; i < vec_num; i++) { + if (slp_node) + def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt); + else + def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt); for (j = 0; j < ncopies; j++) { tree new_def = copy_ssa_name (def); @@ -4698,37 +4612,6 @@ vect_create_epilog_for_reduction (vec vect_defs, } } - /* The epilogue is created for the outer-loop, i.e., for the loop being - vectorized. Create exit phis for the outer loop. 
*/ - if (double_reduc) - { - loop = outer_loop; - exit_bb = single_exit (loop)->dest; - inner_phis.create (vect_defs.length ()); - FOR_EACH_VEC_ELT (new_phis, i, phi) - { - stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); - tree new_result = copy_ssa_name (PHI_RESULT (phi)); - gphi *outer_phi = create_phi_node (new_result, exit_bb); - SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, - PHI_RESULT (phi)); - prev_phi_info = loop_vinfo->add_stmt (outer_phi); - inner_phis.quick_push (phi_info); - new_phis[i] = outer_phi; - while (STMT_VINFO_RELATED_STMT (phi_info)) - { - phi_info = STMT_VINFO_RELATED_STMT (phi_info); - new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt)); - outer_phi = create_phi_node (new_result, exit_bb); - SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx, - PHI_RESULT (phi_info->stmt)); - stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi); - STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info; - prev_phi_info = outer_phi_info; - } - } - } - exit_gsi = gsi_after_labels (exit_bb); /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 @@ -4747,12 +4630,6 @@ vect_create_epilog_for_reduction (vec vect_defs, gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); } - - code = gimple_assign_rhs_code (orig_stmt_info->stmt); - /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, - partial results are added and not subtracted. */ - if (code == MINUS_EXPR) - code = PLUS_EXPR; scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); scalar_type = TREE_TYPE (scalar_dest); @@ -4760,15 +4637,6 @@ vect_create_epilog_for_reduction (vec vect_defs, new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); - /* In case this is a reduction in an inner-loop while vectorizing an outer - loop - we don't need to extract a single scalar result at the end of the - inner-loop (unless it is double reduction, i.e., the use of reduction is - outside the outer-loop). The final vector of partial results will be used - in the vectorized outer-loop, or reduced to a scalar result at the end of - the outer-loop. */ - if (nested_in_vect_loop && !double_reduc) - goto vect_finalize_reduction; - /* SLP reduction without reduction chain, e.g., # a1 = phi # b1 = phi @@ -4791,53 +4659,48 @@ vect_create_epilog_for_reduction (vec vect_defs, one vector. */ if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) { + gimple_seq stmts = NULL; tree first_vect = PHI_RESULT (new_phis[0]); - gassign *new_vec_stmt = NULL; - vec_dest = vect_create_destination_var (scalar_dest, vectype); + first_vect = gimple_convert (&stmts, vectype, first_vect); for (k = 1; k < new_phis.length (); k++) { gimple *next_phi = new_phis[k]; tree second_vect = PHI_RESULT (next_phi); - tree tem = make_ssa_name (vec_dest, new_vec_stmt); - new_vec_stmt = gimple_build_assign (tem, code, - first_vect, second_vect); - gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); - first_vect = tem; + second_vect = gimple_convert (&stmts, vectype, second_vect); + first_vect = gimple_build (&stmts, code, vectype, + first_vect, second_vect); } + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); new_phi_result = first_vect; - if (new_vec_stmt) - { - new_phis.truncate (0); - new_phis.safe_push (new_vec_stmt); - } + new_phis.truncate (0); + new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); } /* Likewise if we couldn't use a single defuse cycle. 
*/ else if (ncopies > 1) { gcc_assert (new_phis.length () == 1); + gimple_seq stmts = NULL; tree first_vect = PHI_RESULT (new_phis[0]); - gassign *new_vec_stmt = NULL; - vec_dest = vect_create_destination_var (scalar_dest, vectype); + first_vect = gimple_convert (&stmts, vectype, first_vect); stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); for (int k = 1; k < ncopies; ++k) { next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); tree second_vect = PHI_RESULT (next_phi_info->stmt); - tree tem = make_ssa_name (vec_dest, new_vec_stmt); - new_vec_stmt = gimple_build_assign (tem, code, - first_vect, second_vect); - gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); - first_vect = tem; + second_vect = gimple_convert (&stmts, vectype, second_vect); + first_vect = gimple_build (&stmts, code, vectype, + first_vect, second_vect); } + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); new_phi_result = first_vect; new_phis.truncate (0); - new_phis.safe_push (new_vec_stmt); + new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); } else new_phi_result = PHI_RESULT (new_phis[0]); - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION + if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn != IFN_LAST) { /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing @@ -4852,8 +4715,7 @@ vect_create_epilog_for_reduction (vec vect_defs, tree index_vec_type = TREE_TYPE (induction_index); gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); tree index_scalar_type = TREE_TYPE (index_vec_type); - tree index_vec_cmp_type = build_same_sized_truth_vector_type - (index_vec_type); + tree index_vec_cmp_type = truth_type_for (index_vec_type); /* Get an unsigned integer version of the type of the data vector. */ int scalar_precision @@ -4946,7 +4808,7 @@ vect_create_epilog_for_reduction (vec vect_defs, gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); scalar_results.safe_push (new_temp); } - else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION + else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn == IFN_LAST) { /* Condition reduction without supported IFN_REDUC_MAX. Generate @@ -4989,7 +4851,6 @@ vect_create_epilog_for_reduction (vec vect_defs, if (off != 0) { tree new_idx_val = idx_val; - tree new_val = val; if (off != v_size - el_size) { new_idx_val = make_ssa_name (idx_eltype); @@ -4998,7 +4859,7 @@ vect_create_epilog_for_reduction (vec vect_defs, old_idx_val); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); } - new_val = make_ssa_name (data_eltype); + tree new_val = make_ssa_name (data_eltype); epilog_stmt = gimple_build_assign (new_val, COND_EXPR, build2 (GT_EXPR, @@ -5060,9 +4921,8 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple_set_lhs (epilog_stmt, new_temp); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == INTEGER_INDUC_COND_REDUCTION) - && !operand_equal_p (initial_def, induc_val, 0)) + if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) + && induc_val) { /* Earlier we set the initial value to be a vector if induc_val values. 
Check the result and if it is induc_val then replace @@ -5100,7 +4960,7 @@ vect_create_epilog_for_reduction (vec vect_defs, tree index = build_index_vector (vectype, 0, 1); tree index_type = TREE_TYPE (index); tree index_elt_type = TREE_TYPE (index_type); - tree mask_type = build_same_sized_truth_vector_type (index_type); + tree mask_type = truth_type_for (index_type); /* Create a vector that, for each element, identifies which of the REDUC_GROUP_SIZE results should use it. */ @@ -5112,6 +4972,14 @@ vect_create_epilog_for_reduction (vec vect_defs, scalar value if we have one, otherwise the initial scalar value is itself a neutral value. */ tree vector_identity = NULL_TREE; + tree neutral_op = NULL_TREE; + if (slp_node) + { + stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); + neutral_op + = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, + vectype, code, first != NULL); + } if (neutral_op) vector_identity = gimple_build_vector_from_val (&seq, vectype, neutral_op); @@ -5161,32 +5029,19 @@ vect_create_epilog_for_reduction (vec vect_defs, bool reduce_with_shift; tree vec_temp; - /* COND reductions all do the final reduction with MAX_EXPR - or MIN_EXPR. */ - if (code == COND_EXPR) - { - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == INTEGER_INDUC_COND_REDUCTION) - code = induc_code; - else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == CONST_COND_REDUCTION) - code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); - else - code = MAX_EXPR; - } - /* See if the target wants to do the final (shift) reduction in a vector mode of smaller size and first reduce upper/lower halves against each other. */ enum machine_mode mode1 = mode; - tree vectype1 = vectype; - unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); - unsigned sz1 = sz; + tree stype = TREE_TYPE (vectype); + unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + unsigned nunits1 = nunits; if (!slp_reduc && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) - sz1 = GET_MODE_SIZE (mode1).to_constant (); + nunits1 = GET_MODE_NUNITS (mode1).to_constant (); - vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); + tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), + stype, nunits1); reduce_with_shift = have_whole_vector_shift (mode1); if (!VECTOR_MODE_P (mode1)) reduce_with_shift = false; @@ -5200,11 +5055,13 @@ vect_create_epilog_for_reduction (vec vect_defs, /* First reduce the vector to the desired vector size we should do shift reduction on by combining upper and lower halves. */ new_temp = new_phi_result; - while (sz > sz1) + while (nunits > nunits1) { gcc_assert (!slp_reduc); - sz /= 2; - vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); + nunits /= 2; + vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), + stype, nunits); + unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); /* The target has to make sure we support lowpart/highpart extraction, either via direct vector extract or through @@ -5229,15 +5086,14 @@ vect_create_epilog_for_reduction (vec vect_defs, = gimple_build_assign (dst2, BIT_FIELD_REF, build3 (BIT_FIELD_REF, vectype1, new_temp, TYPE_SIZE (vectype1), - bitsize_int (sz * BITS_PER_UNIT))); + bitsize_int (bitsize))); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); } else { /* Extract via punning to appropriately sized integer mode vector. 
*/ - tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, - 1); + tree eltype = build_nonstandard_integer_type (bitsize, 1); tree etype = build_vector_type (eltype, 2); gcc_assert (convert_optab_handler (vec_extract_optab, TYPE_MODE (etype), @@ -5266,7 +5122,7 @@ vect_create_epilog_for_reduction (vec vect_defs, = gimple_build_assign (tem, BIT_FIELD_REF, build3 (BIT_FIELD_REF, eltype, new_temp, TYPE_SIZE (eltype), - bitsize_int (sz * BITS_PER_UNIT))); + bitsize_int (bitsize))); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); dst2 = make_ssa_name (vectype1); epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, @@ -5307,8 +5163,8 @@ vect_create_epilog_for_reduction (vec vect_defs, dump_printf_loc (MSG_NOTE, vect_location, "Reduce using vector shifts\n"); - mode1 = TYPE_MODE (vectype1); - vec_dest = vect_create_destination_var (scalar_dest, vectype1); + gimple_seq stmts = NULL; + new_temp = gimple_convert (&stmts, vectype1, new_temp); for (elt_offset = nelements / 2; elt_offset >= 1; elt_offset /= 2) @@ -5316,18 +5172,12 @@ vect_create_epilog_for_reduction (vec vect_defs, calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); indices.new_vector (sel, 2, nelements); tree mask = vect_gen_perm_mask_any (vectype1, indices); - epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, - new_temp, zero_vec, mask); - new_name = make_ssa_name (vec_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_name); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - epilog_stmt = gimple_build_assign (vec_dest, code, new_name, - new_temp); - new_temp = make_ssa_name (vec_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1, + new_temp, zero_vec, mask); + new_temp = gimple_build (&stmts, code, + vectype1, new_name, new_temp); } + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); /* 2.4 Extract the final scalar result. Create: s_out3 = extract_field */ @@ -5439,9 +5289,8 @@ vect_create_epilog_for_reduction (vec vect_defs, scalar_results.safe_push (new_temp); } - if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == INTEGER_INDUC_COND_REDUCTION) - && !operand_equal_p (initial_def, induc_val, 0)) + if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) + && induc_val) { /* Earlier we set the initial value to be a vector if induc_val values. Check the result and if it is induc_val then replace @@ -5457,12 +5306,7 @@ vect_create_epilog_for_reduction (vec vect_defs, scalar_results[0] = tmp; } } - -vect_finalize_reduction: - - if (double_reduc) - loop = loop->inner; - + /* 2.5 Adjust the final result by the initial value of the reduction variable. (When such adjustment is not needed, then 'adjustment_def' is zero). 
For example, if code is PLUS we create: @@ -5471,25 +5315,26 @@ vect_finalize_reduction: if (adjustment_def) { gcc_assert (!slp_reduc); + gimple_seq stmts = NULL; if (nested_in_vect_loop) { new_phi = new_phis[0]; - gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); - expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); - new_dest = vect_create_destination_var (scalar_dest, vectype); + gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); + adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); + new_temp = gimple_build (&stmts, code, vectype, + PHI_RESULT (new_phi), adjustment_def); } else { new_temp = scalar_results[0]; gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); - expr = build2 (code, scalar_type, new_temp, adjustment_def); - new_dest = vect_create_destination_var (scalar_dest, scalar_type); + adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def); + new_temp = gimple_build (&stmts, code, scalar_type, + new_temp, adjustment_def); } - epilog_stmt = gimple_build_assign (new_dest, expr); - new_temp = make_ssa_name (new_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + epilog_stmt = gimple_seq_last_stmt (stmts); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); if (nested_in_vect_loop) { stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); @@ -5507,6 +5352,9 @@ vect_finalize_reduction: new_phis[0] = epilog_stmt; } + if (double_reduc) + loop = loop->inner; + /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit phis with new adjusted scalar results, i.e., replace use with use . @@ -5552,24 +5400,10 @@ vect_finalize_reduction: correspond to the first vector stmt, etc. (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ if (group_size > new_phis.length ()) - { - ratio = group_size / new_phis.length (); - gcc_assert (!(group_size % new_phis.length ())); - } - else - ratio = 1; + gcc_assert (!(group_size % new_phis.length ())); - stmt_vec_info epilog_stmt_info = NULL; for (k = 0; k < group_size; k++) { - if (k % ratio == 0) - { - epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]); - reduction_phi_info = reduction_phis[k / ratio]; - if (double_reduc) - inner_phi = inner_phis[k / ratio]; - } - if (slp_reduc) { stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; @@ -5580,121 +5414,12 @@ vect_finalize_reduction: scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); } - phis.create (3); - /* Find the loop-closed-use at the loop exit of the original scalar - result. (The reduction result is expected to have two immediate uses - - one at the latch block, and one at the loop exit). */ - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) - if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))) - && !is_gimple_debug (USE_STMT (use_p))) - phis.safe_push (USE_STMT (use_p)); - - /* While we expect to have found an exit_phi because of loop-closed-ssa - form we can end up without one if the scalar cycle is dead. 
*/ - - FOR_EACH_VEC_ELT (phis, i, exit_phi) - { - if (outer_loop) - { - stmt_vec_info exit_phi_vinfo - = loop_vinfo->lookup_stmt (exit_phi); - gphi *vect_phi; - - if (double_reduc) - STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi; - else - STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info; - if (!double_reduc - || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) - != vect_double_reduction_def) - continue; - - /* Handle double reduction: - - stmt1: s1 = phi - double reduction phi (outer loop) - stmt2: s3 = phi - (regular) reduc phi (inner loop) - stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) - stmt4: s2 = phi - double reduction stmt (outer loop) - - At that point the regular reduction (stmt2 and stmt3) is - already vectorized, as well as the exit phi node, stmt4. - Here we vectorize the phi node of double reduction, stmt1, and - update all relevant statements. */ - - /* Go through all the uses of s2 to find double reduction phi - node, i.e., stmt1 above. */ - orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) - { - stmt_vec_info use_stmt_vinfo; - tree vect_phi_init, preheader_arg, vect_phi_res; - basic_block bb = gimple_bb (use_stmt); - - /* Check that USE_STMT is really double reduction phi - node. */ - if (gimple_code (use_stmt) != GIMPLE_PHI - || gimple_phi_num_args (use_stmt) != 2 - || bb->loop_father != outer_loop) - continue; - use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt); - if (!use_stmt_vinfo - || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) - != vect_double_reduction_def) - continue; - - /* Create vector phi node for double reduction: - vs1 = phi - vs1 was created previously in this function by a call to - vect_get_vec_def_for_operand and is stored in - vec_initial_def; - vs2 is defined by INNER_PHI, the vectorized EXIT_PHI; - vs0 is created here. */ - - /* Create vector phi node. */ - vect_phi = create_phi_node (vec_initial_def, bb); - loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi); - - /* Create vs0 - initial def of the double reduction phi. */ - preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, - loop_preheader_edge (outer_loop)); - vect_phi_init = get_initial_def_for_reduction - (stmt_info, preheader_arg, NULL); - - /* Update phi node arguments with vs0 and vs2. */ - add_phi_arg (vect_phi, vect_phi_init, - loop_preheader_edge (outer_loop), - UNKNOWN_LOCATION); - add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt), - loop_latch_edge (outer_loop), UNKNOWN_LOCATION); - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "created double reduction phi node: %G", - vect_phi); - - vect_phi_res = PHI_RESULT (vect_phi); - - /* Replace the use, i.e., set the correct vs1 in the regular - reduction phi node. FORNOW, NCOPIES is always 1, so the - loop is redundant. */ - stmt_vec_info use_info = reduction_phi_info; - for (j = 0; j < ncopies; j++) - { - edge pr_edge = loop_preheader_edge (loop); - SET_PHI_ARG_DEF (as_a (use_info->stmt), - pr_edge->dest_idx, vect_phi_res); - use_info = STMT_VINFO_RELATED_STMT (use_info); - } - } - } - } - - phis.release (); if (nested_in_vect_loop) { if (double_reduc) loop = outer_loop; else - continue; + gcc_unreachable (); } phis.create (3); @@ -5824,9 +5549,6 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, gcc_assert (!nested_in_vect_loop_p (loop, stmt_info)); gcc_assert (ncopies == 1); gcc_assert (TREE_CODE_LENGTH (code) == binary_op); - gcc_assert (reduc_index == (code == MINUS_EXPR ? 
0 : 1)); - gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == FOLD_LEFT_REDUCTION); if (slp_node) gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out), @@ -5840,10 +5562,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, if (slp_node) { auto_vec > vec_defs (2); - auto_vec sops(2); - sops.quick_push (ops[0]); - sops.quick_push (ops[1]); - vect_get_slp_defs (sops, slp_node, &vec_defs); + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]); vec_defs[0].release (); vec_defs[1].release (); @@ -5984,6 +5703,55 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) <= TYPE_PRECISION (lhs_type)); } +/* Check if masking can be supported by inserting a conditional expression. + CODE is the code for the operation. COND_FN is the conditional internal + function, if it exists. VECTYPE_IN is the type of the vector input. */ +static bool +use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, + tree vectype_in) +{ + if (cond_fn != IFN_LAST + && direct_internal_fn_supported_p (cond_fn, vectype_in, + OPTIMIZE_FOR_SPEED)) + return false; + + switch (code) + { + case DOT_PROD_EXPR: + return true; + + default: + return false; + } +} + +/* Insert a conditional expression to enable masked vectorization. CODE is the + code for the operation. VOP is the array of operands. MASK is the loop + mask. GSI is a statement iterator used to place the new conditional + expression. */ +static void +build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, + gimple_stmt_iterator *gsi) +{ + switch (code) + { + case DOT_PROD_EXPR: + { + tree vectype = TREE_TYPE (vop[1]); + tree zero = build_zero_cst (vectype); + tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); + gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, + mask, vop[1], zero); + gsi_insert_before (gsi, select, GSI_SAME_STMT); + vop[1] = masked_op1; + break; + } + + default: + gcc_unreachable (); + } +} + /* Function vectorizable_reduction. Check if STMT_INFO performs a reduction operation that can be vectorized. @@ -6027,182 +5795,163 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop) corresponds to the type of arguments to the reduction stmt, and should *NOT* be used to create the vectorized stmt. The right vectype for the vectorized stmt is obtained from the type of the result X: - get_vectype_for_scalar_type (TREE_TYPE (X)) + get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) This means that, contrary to "regular" reductions (or "regular" stmts in general), the following equation: - STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) + STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) does *NOT* necessarily hold for reduction patterns. 
*/ bool -vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, - stmt_vec_info *vec_stmt, slp_tree slp_node, +vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node, slp_instance slp_node_instance, stmt_vector_for_cost *cost_vec) { - tree vec_dest; tree scalar_dest; - tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); tree vectype_in = NULL_TREE; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - enum tree_code code, orig_code; - internal_fn reduc_fn; - machine_mode vec_mode; - int op_type; - optab optab; - tree new_temp = NULL_TREE; - enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; + enum vect_def_type cond_reduc_dt = vect_unknown_def_type; stmt_vec_info cond_stmt_vinfo = NULL; - enum tree_code cond_reduc_op_code = ERROR_MARK; tree scalar_type; - bool is_simple_use; int i; int ncopies; - int epilog_copies; - stmt_vec_info prev_stmt_info, prev_phi_info; bool single_defuse_cycle = false; - stmt_vec_info new_stmt_info = NULL; - int j; - tree ops[3]; - enum vect_def_type dts[3]; - bool nested_cycle = false, found_nested_cycle_def = false; + bool nested_cycle = false; bool double_reduc = false; - basic_block def_bb; - struct loop * def_stmt_loop; - tree def_arg; - auto_vec vec_oprnds0; - auto_vec vec_oprnds1; - auto_vec vec_oprnds2; - auto_vec vect_defs; - auto_vec phis; int vec_num; - tree def0, tem; + tree tem; tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; tree cond_reduc_val = NULL_TREE; /* Make sure it was already recognized as a reduction computation. */ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) return false; - if (nested_in_vect_loop_p (loop, stmt_info)) + /* The stmt we store reduction analysis meta on. */ + stmt_vec_info reduc_info = info_for_reduction (stmt_info); + reduc_info->is_reduc_info = true; + + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) { - loop = loop->inner; - nested_cycle = true; + if (is_a (stmt_info->stmt)) + { + /* Analysis for double-reduction is done on the outer + loop PHI, nested cycles have no further restrictions. */ + STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; + /* For nested cycles we want to let regular vectorizable_* + routines handle code-generation. */ + if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_double_reduction_def) + { + stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); + STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def; + STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (stmt_info)) + = vect_internal_def; + } + } + else + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; } - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) - gcc_assert (slp_node - && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); - - if (gphi *phi = dyn_cast (stmt_info->stmt)) + stmt_vec_info orig_stmt_of_analysis = stmt_info; + stmt_vec_info phi_info = stmt_info; + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) { - tree phi_result = gimple_phi_result (phi); - /* Analysis is fully done on the reduction stmt invocation. */ - if (! vec_stmt) + if (!is_a (stmt_info->stmt)) { - if (slp_node) - slp_node_instance->reduc_phis = slp_node; - STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; return true; } - - if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) - /* Leave the scalar phi in place. 
Note that checking - STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works - for reductions involving a single statement. */ - return true; - - stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); - reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); - - if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info) - == EXTRACT_LAST_REDUCTION) - /* Leave the scalar phi in place. */ - return true; - - gassign *reduc_stmt = as_a (reduc_stmt_info->stmt); - code = gimple_assign_rhs_code (reduc_stmt); - for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) + if (slp_node) { - tree op = gimple_op (reduc_stmt, k); - if (op == phi_result) - continue; - if (k == 1 && code == COND_EXPR) - continue; - bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt); - gcc_assert (is_simple_use); - if (dt == vect_constant_def || dt == vect_external_def) - continue; - if (!vectype_in - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) - vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); - break; + slp_node_instance->reduc_phis = slp_node; + /* ??? We're leaving slp_node to point to the PHIs, we only + need it to get at the number of vector stmts which wasn't + yet initialized for the instance root. */ } - /* For a nested cycle we might end up with an operation like - phi_result * phi_result. */ - if (!vectype_in) - vectype_in = STMT_VINFO_VECTYPE (stmt_info); - gcc_assert (vectype_in); - - if (slp_node) - ncopies = 1; - else - ncopies = vect_get_num_copies (loop_vinfo, vectype_in); - - stmt_vec_info use_stmt_info; - if (ncopies > 1 - && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live - && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result)) - && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info) - single_defuse_cycle = true; - - /* Create the destination vector */ - scalar_dest = gimple_assign_lhs (reduc_stmt); - vec_dest = vect_create_destination_var (scalar_dest, vectype_out); - - if (slp_node) - /* The size vect_schedule_slp_instance computes is off for us. */ - vec_num = vect_get_num_vectors - (LOOP_VINFO_VECT_FACTOR (loop_vinfo) - * SLP_TREE_SCALAR_STMTS (slp_node).length (), - vectype_in); - else - vec_num = 1; + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) + stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); + else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */ + { + use_operand_p use_p; + gimple *use_stmt; + bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), + &use_p, &use_stmt); + gcc_assert (res); + phi_info = loop_vinfo->lookup_stmt (use_stmt); + stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); + } + } - /* Generate the reduction PHIs upfront. */ - prev_phi_info = NULL; - for (j = 0; j < ncopies; j++) + /* PHIs should not participate in patterns. */ + gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); + gphi *reduc_def_phi = as_a (phi_info->stmt); + + /* Verify following REDUC_IDX from the latch def leads us back to the PHI + and compute the reduction chain length. 
*/ + tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, + loop_latch_edge (loop)); + unsigned reduc_chain_length = 0; + bool only_slp_reduc_chain = true; + stmt_info = NULL; + while (reduc_def != PHI_RESULT (reduc_def_phi)) + { + stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); + stmt_vec_info vdef = vect_stmt_to_vectorize (def); + if (STMT_VINFO_REDUC_IDX (vdef) == -1) { - if (j == 0 || !single_defuse_cycle) + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction chain broken by patterns.\n"); + return false; + } + if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) + only_slp_reduc_chain = false; + /* ??? For epilogue generation live members of the chain need + to point back to the PHI via their original stmt for + info_for_reduction to work. */ + if (STMT_VINFO_LIVE_P (vdef)) + STMT_VINFO_REDUC_DEF (def) = phi_info; + if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (vdef->stmt))) + { + if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (vdef->stmt)), + TREE_TYPE (gimple_assign_rhs1 (vdef->stmt)))) { - for (i = 0; i < vec_num; i++) - { - /* Create the reduction-phi that defines the reduction - operand. */ - gimple *new_phi = create_phi_node (vec_dest, loop->header); - stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); - - if (slp_node) - SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); - else - { - if (j == 0) - STMT_VINFO_VEC_STMT (stmt_info) - = *vec_stmt = new_phi_info; - else - STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; - prev_phi_info = new_phi_info; - } - } + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "conversion in the reduction chain.\n"); + return false; } } + else if (!stmt_info) + /* First non-conversion stmt. */ + stmt_info = vdef; + reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef)); + reduc_chain_length++; + } + /* PHIs should not participate in patterns. */ + gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); - return true; + if (nested_in_vect_loop_p (loop, stmt_info)) + { + loop = loop->inner; + nested_cycle = true; + } + + /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last + element. */ + if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + { + gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); + stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); } + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + gcc_assert (slp_node + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop, unless @@ -6235,37 +5984,13 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, inside the loop body. The last operand is the reduction variable, which is defined by the loop-header-phi. */ + tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; gassign *stmt = as_a (stmt_info->stmt); - - /* Flatten RHS. 
*/ - switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) - { - case GIMPLE_BINARY_RHS: - code = gimple_assign_rhs_code (stmt); - op_type = TREE_CODE_LENGTH (code); - gcc_assert (op_type == binary_op); - ops[0] = gimple_assign_rhs1 (stmt); - ops[1] = gimple_assign_rhs2 (stmt); - break; - - case GIMPLE_TERNARY_RHS: - code = gimple_assign_rhs_code (stmt); - op_type = TREE_CODE_LENGTH (code); - gcc_assert (op_type == ternary_op); - ops[0] = gimple_assign_rhs1 (stmt); - ops[1] = gimple_assign_rhs2 (stmt); - ops[2] = gimple_assign_rhs3 (stmt); - break; - - case GIMPLE_UNARY_RHS: - return false; - - default: - gcc_unreachable (); - } - - if (code == COND_EXPR && slp_node) - return false; + enum tree_code code = gimple_assign_rhs_code (stmt); + bool lane_reduc_code_p + = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR); + int op_type = TREE_CODE_LENGTH (code); scalar_dest = gimple_assign_lhs (stmt); scalar_type = TREE_TYPE (scalar_dest); @@ -6277,67 +6002,65 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (!type_has_mode_precision_p (scalar_type)) return false; + /* For lane-reducing ops we're reducing the number of reduction PHIs + which means the only use of that may be in the lane-reducing operation. */ + if (lane_reduc_code_p + && reduc_chain_length != 1 + && !only_slp_reduc_chain) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "lane-reducing reduction with extra stmts.\n"); + return false; + } + /* All uses but the last are expected to be defined in the loop. The last use is the reduction variable. In case of nested cycle this assumption is not true: we use reduc_index to record the index of the reduction variable. */ - stmt_vec_info reduc_def_info; - if (orig_stmt_info) - reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info); - else - reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info); - gcc_assert (reduc_def_info); - gphi *reduc_def_phi = as_a (reduc_def_info->stmt); - tree reduc_def = PHI_RESULT (reduc_def_phi); - int reduc_index = -1; + reduc_def = PHI_RESULT (reduc_def_phi); for (i = 0; i < op_type; i++) { + tree op = gimple_op (stmt, i + 1); /* The condition of COND_EXPR is checked in vectorizable_condition(). */ if (i == 0 && code == COND_EXPR) continue; stmt_vec_info def_stmt_info; - is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem, - &def_stmt_info); - dt = dts[i]; - gcc_assert (is_simple_use); - if (dt == vect_reduction_def - && ops[i] == reduc_def) - { - reduc_index = i; - continue; - } - else if (tem) + enum vect_def_type dt; + if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem, + &def_stmt_info)) { - /* To properly compute ncopies we are interested in the widest - input type in case we're looking at a widening accumulation. */ - if (!vectype_in - || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) - < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) - vectype_in = tem; + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "use not simple.\n"); + return false; } + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) + continue; - if (dt != vect_internal_def - && dt != vect_external_def - && dt != vect_constant_def - && dt != vect_induction_def - && !(dt == vect_nested_cycle && nested_cycle)) + /* There should be only one cycle def in the stmt, the one + leading to reduc_def. 
*/ + if (VECTORIZABLE_CYCLE_DEF (dt)) return false; - if (dt == vect_nested_cycle - && ops[i] == reduc_def) - { - found_nested_cycle_def = true; - reduc_index = i; - } + /* To properly compute ncopies we are interested in the widest + non-reduction input type in case we're looking at a widening + accumulation that we later handle in vect_transform_reduction. */ + if (lane_reduc_code_p + && tem + && (!vectype_in + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) + < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))) + vectype_in = tem; - if (i == 1 && code == COND_EXPR) + if (code == COND_EXPR) { - /* Record how value of COND_EXPR is defined. */ + /* Record how the non-reduction-def value of COND_EXPR is defined. */ if (dt == vect_constant_def) { cond_reduc_dt = dt; - cond_reduc_val = ops[i]; + cond_reduc_val = op; } if (dt == vect_induction_def && def_stmt_info @@ -6348,93 +6071,35 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } } - if (!vectype_in) - vectype_in = vectype_out; - - /* When vectorizing a reduction chain w/o SLP the reduction PHI is not - directy used in stmt. */ - if (reduc_index == -1) - { - if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "in-order reduction chain without SLP.\n"); - return false; - } - } - - if (!(reduc_index == -1 - || dts[reduc_index] == vect_reduction_def - || dts[reduc_index] == vect_nested_cycle - || ((dts[reduc_index] == vect_internal_def - || dts[reduc_index] == vect_external_def - || dts[reduc_index] == vect_constant_def - || dts[reduc_index] == vect_induction_def) - && nested_cycle && found_nested_cycle_def))) - { - /* For pattern recognized stmts, orig_stmt might be a reduction, - but some helper statements for the pattern might not, or - might be COND_EXPRs with reduction uses in the condition. */ - gcc_assert (orig_stmt_info); - return false; - } - - /* PHIs should not participate in patterns. */ - gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info)); - enum vect_reduction_type v_reduc_type - = STMT_VINFO_REDUC_TYPE (reduc_def_info); - stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info); + vectype_in = STMT_VINFO_VECTYPE (phi_info); + STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; - STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type; + enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); + STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; /* If we have a condition reduction, see if we can simplify it further. */ if (v_reduc_type == COND_REDUCTION) { - /* TODO: We can't yet handle reduction chains, since we need to treat - each COND_EXPR in the chain specially, not just the last one. - E.g. for: - - x_1 = PHI - x_2 = a_2 ? ... : x_1; - x_3 = a_3 ? ... : x_2; + if (slp_node) + return false; - we're interested in the last element in x_3 for which a_2 || a_3 - is true, whereas the current reduction chain handling would - vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3 - as a reduction operation. */ - if (reduc_index == -1) + /* When the condition uses the reduction value in the condition, fail. */ + if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "conditional reduction chains not supported\n"); + "condition depends on previous iteration\n"); return false; } - /* vect_is_simple_reduction ensured that operand 2 is the - loop-carried operand. 
*/ - gcc_assert (reduc_index == 2); - - /* Loop peeling modifies initial value of reduction PHI, which - makes the reduction stmt to be transformed different to the - original stmt analyzed. We need to record reduction code for - CONST_COND_REDUCTION type reduction at analyzing stage, thus - it can be used directly at transform stage. */ - if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR - || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR) - { - /* Also set the reduction type to CONST_COND_REDUCTION. */ - gcc_assert (cond_reduc_dt == vect_constant_def); - STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; - } - else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, - vectype_in, OPTIMIZE_FOR_SPEED)) + if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, + vectype_in, OPTIMIZE_FOR_SPEED)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "optimizing condition reduction with" " FOLD_EXTRACT_LAST.\n"); - STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; + STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; } else if (cond_reduc_dt == vect_induction_def) { @@ -6445,6 +6110,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, gcc_assert (TREE_CODE (base) == INTEGER_CST && TREE_CODE (step) == INTEGER_CST); cond_reduc_val = NULL_TREE; + enum tree_code cond_reduc_op_code = ERROR_MARK; tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) ; @@ -6477,16 +6143,17 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, dump_printf_loc (MSG_NOTE, vect_location, "condition expression based on " "integer induction.\n"); - STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - = INTEGER_INDUC_COND_REDUCTION; + STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; + STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) + = cond_reduc_val; + STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; } } else if (cond_reduc_dt == vect_constant_def) { enum vect_def_type cond_initial_dt; - gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]); tree cond_initial_val - = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); + = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); gcc_assert (cond_reduc_val != NULL_TREE); vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); @@ -6503,25 +6170,15 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, "condition expression based on " "compile time constant.\n"); /* Record reduction code at analysis stage. */ - STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) + STMT_VINFO_REDUC_CODE (reduc_info) = integer_onep (e) ? MAX_EXPR : MIN_EXPR; - STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - = CONST_COND_REDUCTION; + STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; } } } } - if (orig_stmt_info) - gcc_assert (tmp == orig_stmt_info - || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info); - else - /* We changed STMT to be the first stmt in reduction chain, hence we - check that in this case the first element in the chain is STMT. 
*/ - gcc_assert (tmp == stmt_info - || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info); - - if (STMT_VINFO_LIVE_P (reduc_def_info)) + if (STMT_VINFO_LIVE_P (phi_info)) return false; if (slp_node) @@ -6531,102 +6188,13 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, gcc_assert (ncopies >= 1); - vec_mode = TYPE_MODE (vectype_in); poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); if (nested_cycle) { - def_bb = gimple_bb (reduc_def_phi); - def_stmt_loop = def_bb->loop_father; - def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, - loop_preheader_edge (def_stmt_loop)); - stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); - if (def_arg_stmt_info - && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) - == vect_double_reduction_def)) - double_reduc = true; - } - - vect_reduction_type reduction_type - = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); - if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) - && ncopies > 1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multiple types in double reduction or condition " - "reduction.\n"); - return false; - } - - if (code == COND_EXPR) - { - /* Only call during the analysis stage, otherwise we'll lose - STMT_VINFO_TYPE. */ - if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL, - true, NULL, cost_vec)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "unsupported condition in reduction\n"); - return false; - } - } - else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR - || code == LROTATE_EXPR || code == RROTATE_EXPR) - { - /* Only call during the analysis stage, otherwise we'll lose - STMT_VINFO_TYPE. We only support this for nested cycles - without double reductions at the moment. */ - if (!nested_cycle - || double_reduc - || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL, - NULL, cost_vec))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "unsupported shift or rotation in reduction\n"); - return false; - } - } - else - { - /* 4. Supportable by target? */ - - /* 4.1. check support for the operation in the loop */ - optab = optab_for_tree_code (code, vectype_in, optab_default); - if (!optab) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "no optab.\n"); - - return false; - } - - if (optab_handler (optab, vec_mode) == CODE_FOR_nothing) - { - if (dump_enabled_p ()) - dump_printf (MSG_NOTE, "op not supported by target.\n"); - - if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) - || !vect_worthwhile_without_simd_p (loop_vinfo, code)) - return false; - - if (dump_enabled_p ()) - dump_printf (MSG_NOTE, "proceeding using word mode.\n"); - } - - /* Worthwhile without SIMD support? */ - if (!VECTOR_MODE_P (TYPE_MODE (vectype_in)) - && !vect_worthwhile_without_simd_p (loop_vinfo, code)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not worthwhile without SIMD support.\n"); - - return false; - } + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) + == vect_double_reduction_def); + double_reduc = true; } /* 4.2. Check support for the epilog operation. @@ -6664,38 +6232,55 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, (and also the same tree-code) when generating the epilog code and when generating the code inside the loop. 
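The needs_fold_left_reduction_p test above is what forces FOLD_LEFT_REDUCTION: both the in-loop partial accumulators and the epilogue tree reduction reassociate the operation, and for floating point that can change the rounded result, so without relaxed FP semantics the reduction has to stay strictly in order. The difference is easy to reproduce; the sketch below is illustrative only, not GCC code.

#include <cstddef>

/* In-order (fold-left) sum: one running accumulator, the exact scalar
   evaluation order.  */
static float
sum_in_order (const float *a, size_t n)
{
  float acc = 0.0f;
  for (size_t i = 0; i < n; ++i)
    acc += a[i];
  return acc;
}

/* Reassociated sum with two partial accumulators, the shape an ordinary
   TREE_CODE_REDUCTION produces.  For a = {1e8f, 1.0f, -1e8f, 1.0f} this
   returns 2.0f while sum_in_order returns 1.0f.  */
static float
sum_reassociated (const float *a, size_t n)
{
  float acc0 = 0.0f, acc1 = 0.0f;
  for (size_t i = 0; i + 1 < n; i += 2)
    {
      acc0 += a[i];
      acc1 += a[i + 1];
    }
  if (n & 1)
    acc0 += a[n - 1];
  return acc0 + acc1;
}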
*/ - if (orig_stmt_info - && (reduction_type == TREE_CODE_REDUCTION - || reduction_type == FOLD_LEFT_REDUCTION)) - { - /* This is a reduction pattern: get the vectype from the type of the - reduction variable, and get the tree-code from orig_stmt. */ - orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt); - gcc_assert (vectype_out); - vec_mode = TYPE_MODE (vectype_out); - } - else - { - /* Regular reduction: use the same vectype and tree-code as used for - the vector code inside the loop can be used for the epilog code. */ - orig_code = code; - - if (code == MINUS_EXPR) - orig_code = PLUS_EXPR; + enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info); + STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; - /* For simple condition reductions, replace with the actual expression - we want to base our reduction around. */ - if (reduction_type == CONST_COND_REDUCTION) + vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); + if (reduction_type == TREE_CODE_REDUCTION) + { + /* Check whether it's ok to change the order of the computation. + Generally, when vectorizing a reduction we change the order of the + computation. This may change the behavior of the program in some + cases, so we need to check that this is ok. One exception is when + vectorizing an outer-loop: the inner-loop is executed sequentially, + and therefore vectorizing reductions in the inner-loop during + outer-loop vectorization is safe. */ + if (needs_fold_left_reduction_p (scalar_type, orig_code)) + { + /* When vectorizing a reduction chain w/o SLP the reduction PHI + is not directy used in stmt. */ + if (!only_slp_reduc_chain + && reduc_chain_length != 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order reduction chain without SLP.\n"); + return false; + } + STMT_VINFO_REDUC_TYPE (reduc_info) + = reduction_type = FOLD_LEFT_REDUCTION; + } + else if (!commutative_tree_code (orig_code) + || !associative_tree_code (orig_code)) { - orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); - gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction: not commutative/associative"); + return false; } - else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) - orig_code = cond_reduc_op_code; } - reduc_fn = IFN_LAST; + if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) + && ncopies > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "multiple types in double reduction or condition " + "reduction or fold-left reduction.\n"); + return false; + } + internal_fn reduc_fn = IFN_LAST; if (reduction_type == TREE_CODE_REDUCTION || reduction_type == FOLD_LEFT_REDUCTION || reduction_type == INTEGER_INDUC_COND_REDUCTION @@ -6740,6 +6325,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, OPTIMIZE_FOR_SPEED)) reduc_fn = IFN_REDUC_MAX; } + STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; if (reduction_type != EXTRACT_LAST_REDUCTION && (!nested_cycle || double_reduc) @@ -6757,7 +6343,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, tree neutral_op = NULL_TREE; if (slp_node) neutral_op = neutral_op_for_slp_reduction - (slp_node_instance->reduc_phis, code, + (slp_node_instance->reduc_phis, vectype_out, orig_code, REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) @@ -6822,10 +6408,11 @@ vectorizable_reduction 
(stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, which each SLP statement has its own initial value and in which that value needs to be repeated for every instance of the statement within the initial vector. */ - unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); + unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); if (!neutral_op - && !can_duplicate_and_interleave_p (group_size, elt_mode)) + && !can_duplicate_and_interleave_p (loop_vinfo, group_size, + elt_mode)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -6848,26 +6435,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } - /* In case of widenning multiplication by a constant, we update the type - of the constant to be the type of the other operand. We check that the - constant fits the type in the pattern recognition pass. */ - if (code == DOT_PROD_EXPR - && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1]))) - { - if (TREE_CODE (ops[0]) == INTEGER_CST) - ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]); - else if (TREE_CODE (ops[1]) == INTEGER_CST) - ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]); - else - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "invalid types in dot-prod\n"); - - return false; - } - } - if (reduction_type == COND_REDUCTION) { widest_int ni; @@ -6925,26 +6492,68 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, This only works when we see both the reduction PHI and its only consumer in vectorizable_reduction and there are no intermediate stmts participating. */ - stmt_vec_info use_stmt_info; - tree reduc_phi_result = gimple_phi_result (reduc_def_phi); if (ncopies > 1 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) - && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result)) - && vect_stmt_to_vectorize (use_stmt_info) == stmt_info) + && reduc_chain_length == 1) + single_defuse_cycle = true; + + if (single_defuse_cycle || lane_reduc_code_p) { - single_defuse_cycle = true; - epilog_copies = 1; + gcc_assert (code != COND_EXPR); + + /* 4. Supportable by target? */ + bool ok = true; + + /* 4.1. check support for the operation in the loop */ + optab optab = optab_for_tree_code (code, vectype_in, optab_vector); + if (!optab) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "no optab.\n"); + ok = false; + } + + machine_mode vec_mode = TYPE_MODE (vectype_in); + if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing) + { + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "op not supported by target.\n"); + if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) + || !vect_worthwhile_without_simd_p (loop_vinfo, code)) + ok = false; + else + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "proceeding using word mode.\n"); + } + + /* Worthwhile without SIMD support? */ + if (ok + && !VECTOR_MODE_P (TYPE_MODE (vectype_in)) + && !vect_worthwhile_without_simd_p (loop_vinfo, code)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not worthwhile without SIMD support.\n"); + ok = false; + } + + /* lane-reducing operations have to go through vect_transform_reduction. + For the other cases try without the single cycle optimization. 
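As a rough illustration (not taken from the patch itself): a lane-reducing operation such as DOT_PROD_EXPR, WIDEN_SUM_EXPR or SAD_EXPR folds several narrow input lanes into one wider accumulator lane on every vector step, which is why such statements are routed through vect_transform_reduction here instead of the generic paths. The scalar shape matched by the dot-product case is simply a widening multiply-accumulate:

/* Illustrative scalar form only; the narrow-to-wide element ratio (i8
   products summed into an i32 accumulator) is an assumption for the
   example, not taken from the patch.  */
int dot_i8 (const signed char *a, const signed char *b, int n)
{
  int acc = 0;
  for (int i = 0; i < n; ++i)
    acc += (int) a[i] * (int) b[i];   /* products are widened before summing */
  return acc;
}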
*/ + if (!ok) + { + if (lane_reduc_code_p) + return false; + else + single_defuse_cycle = false; + } } - else - epilog_copies = ncopies; + STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; /* If the reduction stmt is one of the patterns that have lane reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 - && ! single_defuse_cycle) - && (code == DOT_PROD_EXPR - || code == WIDEN_SUM_EXPR - || code == SAD_EXPR)) + if ((ncopies > 1 && ! single_defuse_cycle) + && lane_reduc_code_p) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -6958,46 +6567,130 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, else vec_num = 1; + vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies, + cost_vec); + if (dump_enabled_p () + && reduction_type == FOLD_LEFT_REDUCTION) + dump_printf_loc (MSG_NOTE, vect_location, + "using an in-order (fold-left) reduction.\n"); + STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; + /* All but single defuse-cycle optimized, lane-reducing and fold-left + reductions go through their own vectorizable_* routines. */ + if (!single_defuse_cycle + && code != DOT_PROD_EXPR + && code != WIDEN_SUM_EXPR + && code != SAD_EXPR + && reduction_type != FOLD_LEFT_REDUCTION) + { + stmt_vec_info tem + = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); + if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) + { + gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); + tem = REDUC_GROUP_FIRST_ELEMENT (tem); + } + STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; + STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; + } + else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + internal_fn cond_fn = get_conditional_internal_fn (code); + + if (reduction_type != FOLD_LEFT_REDUCTION + && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in) + && (cond_fn == IFN_LAST + || !direct_internal_fn_supported_p (cond_fn, vectype_in, + OPTIMIZE_FOR_SPEED))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because no" + " conditional operation is available.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else + vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, + vectype_in, NULL); + } + return true; +} + +/* Transform the definition stmt STMT_INFO of a reduction PHI backedge + value. */ + +bool +vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + stmt_vec_info *vec_stmt, slp_tree slp_node) +{ + tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + int i; + int ncopies; + int j; + int vec_num; + + stmt_vec_info reduc_info = info_for_reduction (stmt_info); + gcc_assert (reduc_info->is_reduc_info); + + if (nested_in_vect_loop_p (loop, stmt_info)) + { + loop = loop->inner; + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def); + } + + gassign *stmt = as_a (stmt_info->stmt); + enum tree_code code = gimple_assign_rhs_code (stmt); + int op_type = TREE_CODE_LENGTH (code); + + /* Flatten RHS. */ + tree ops[3]; + switch (get_gimple_rhs_class (code)) + { + case GIMPLE_TERNARY_RHS: + ops[2] = gimple_assign_rhs3 (stmt); + /* Fall thru. 
*/ + case GIMPLE_BINARY_RHS: + ops[0] = gimple_assign_rhs1 (stmt); + ops[1] = gimple_assign_rhs2 (stmt); + break; + default: + gcc_unreachable (); + } + + /* All uses but the last are expected to be defined in the loop. + The last use is the reduction variable. In case of nested cycle this + assumption is not true: we use reduc_index to record the index of the + reduction variable. */ + stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); + gphi *reduc_def_phi = as_a (phi_info->stmt); + int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); + tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); + + if (slp_node) + { + ncopies = 1; + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + } + else + { + ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + vec_num = 1; + } + internal_fn cond_fn = get_conditional_internal_fn (code); vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); - - if (!vec_stmt) /* transformation not required. */ - { - vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec); - if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) - { - if (reduction_type != FOLD_LEFT_REDUCTION - && (cond_fn == IFN_LAST - || !direct_internal_fn_supported_p (cond_fn, vectype_in, - OPTIMIZE_FOR_SPEED))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't use a fully-masked loop because no" - " conditional operation is available.\n"); - LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; - } - else if (reduc_index == -1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't use a fully-masked loop for chained" - " reductions.\n"); - LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; - } - else - vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, - vectype_in); - } - if (dump_enabled_p () - && reduction_type == FOLD_LEFT_REDUCTION) - dump_printf_loc (MSG_NOTE, vect_location, - "using an in-order (fold-left) reduction.\n"); - STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; - return true; - } + bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in); /* Transform. 
*/ + stmt_vec_info new_stmt_info = NULL; + stmt_vec_info prev_stmt_info; + tree new_temp = NULL_TREE; + auto_vec vec_oprnds0; + auto_vec vec_oprnds1; + auto_vec vec_oprnds2; + tree def0; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); @@ -7008,23 +6701,26 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); if (reduction_type == FOLD_LEFT_REDUCTION) - return vectorize_fold_left_reduction - (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, - reduc_fn, ops, vectype_in, reduc_index, masks); - - if (reduction_type == EXTRACT_LAST_REDUCTION) { - gcc_assert (!slp_node); - return vectorizable_condition (stmt_info, gsi, vec_stmt, - true, NULL, NULL); + internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); + return vectorize_fold_left_reduction + (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code, + reduc_fn, ops, vectype_in, reduc_index, masks); } + bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); + gcc_assert (single_defuse_cycle + || code == DOT_PROD_EXPR + || code == WIDEN_SUM_EXPR + || code == SAD_EXPR); + /* Create the destination vector */ - vec_dest = vect_create_destination_var (scalar_dest, vectype_out); + tree scalar_dest = gimple_assign_lhs (stmt); + tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out); prev_stmt_info = NULL; - prev_phi_info = NULL; if (!slp_node) { vec_oprnds0.create (1); @@ -7033,32 +6729,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_oprnds2.create (1); } - phis.create (vec_num); - vect_defs.create (vec_num); - if (!slp_node) - vect_defs.quick_push (NULL_TREE); - - if (slp_node) - phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); - else - phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info)); - for (j = 0; j < ncopies; j++) { - if (code == COND_EXPR) - { - gcc_assert (!slp_node); - vectorizable_condition (stmt_info, gsi, vec_stmt, - true, NULL, NULL); - break; - } - if (code == LSHIFT_EXPR - || code == RSHIFT_EXPR) - { - vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL); - break; - } - /* Handle uses. */ if (j == 0) { @@ -7066,16 +6738,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { /* Get vec defs for all the operands except the reduction index, ensuring the ordering of the ops in the vector is kept. */ - auto_vec slp_ops; auto_vec, 3> vec_defs; - - slp_ops.quick_push (ops[0]); - slp_ops.quick_push (ops[1]); - if (op_type == ternary_op) - slp_ops.quick_push (ops[2]); - - vect_get_slp_defs (slp_ops, slp_node, &vec_defs); - + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds0.safe_splice (vec_defs[0]); vec_defs[0].release (); vec_oprnds1.safe_splice (vec_defs[1]); @@ -7130,7 +6794,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) { tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; - if (masked_loop_p) + if (masked_loop_p && !mask_by_cond_expr) { /* Make sure that the reduction accumulator is vop[0]. 
*/ if (reduc_index == 1) @@ -7154,6 +6818,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (op_type == ternary_op) vop[2] = vec_oprnds2[i]; + if (masked_loop_p && mask_by_cond_expr) + { + tree mask = vect_get_loop_mask (gsi, masks, + vec_num * ncopies, + vectype_in, i * ncopies + j); + build_vect_cond_expr (code, vop, mask, gsi); + } + gassign *new_stmt = gimple_build_assign (vec_dest, code, vop[0], vop[1], vop[2]); new_temp = make_ssa_name (vec_dest, new_stmt); @@ -7163,15 +6835,10 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } if (slp_node) - { - SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); - vect_defs.quick_push (new_temp); - } - else - vect_defs[0] = new_temp; + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); } - if (slp_node) + if (slp_node || single_defuse_cycle) continue; if (j == 0) @@ -7182,20 +6849,244 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, prev_stmt_info = new_stmt_info; } - /* Finalize the reduction-phi (set its arguments) and create the - epilog reduction code. */ - if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) - vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt); + if (single_defuse_cycle && !slp_node) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; + + return true; +} + +/* Transform phase of a cycle PHI. */ + +bool +vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt, + slp_tree slp_node, slp_instance slp_node_instance) +{ + tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + int i; + int ncopies; + stmt_vec_info prev_phi_info; + int j; + bool nested_cycle = false; + int vec_num; + + if (nested_in_vect_loop_p (loop, stmt_info)) + { + loop = loop->inner; + nested_cycle = true; + } + + stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info); + reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info); + stmt_vec_info reduc_info = info_for_reduction (stmt_info); + gcc_assert (reduc_info->is_reduc_info); + + if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION + || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION) + /* Leave the scalar phi in place. */ + return true; + + tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info); + /* For a nested cycle we do not fill the above. */ + if (!vectype_in) + vectype_in = STMT_VINFO_VECTYPE (stmt_info); + gcc_assert (vectype_in); + + if (slp_node) + { + /* The size vect_schedule_slp_instance computes is off for us. */ + vec_num = vect_get_num_vectors + (LOOP_VINFO_VECT_FACTOR (loop_vinfo) + * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in); + ncopies = 1; + } + else + { + vec_num = 1; + ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + } + + /* Check whether we should use a single PHI node and accumulate + vectors to one before the backedge. */ + if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info)) + ncopies = 1; + + /* Create the destination vector */ + gphi *phi = as_a (stmt_info->stmt); + tree vec_dest = vect_create_destination_var (gimple_phi_result (phi), + vectype_out); + + /* Get the loop-entry arguments. 
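As a minimal sketch of what the initial definition achieves (illustrative only, assuming a vectorization factor of 4 and a plain PLUS reduction): the vector accumulator has to start from the operation's neutral value in all but one lane, or from the neutral value in every lane with the scalar initial value re-applied as the epilogue adjustment, so that reducing the lanes of the final accumulator reproduces the scalar result.

/* Sketch only; VF and the lane layout are assumptions for illustration.  */
int sum_reduction (const int *a, int n, int init)
{
  enum { VF = 4 };
  int acc[VF] = { init, 0, 0, 0 };   /* neutral element 0 in the other lanes */
  int i;
  for (i = 0; i + VF <= n; i += VF)
    for (int lane = 0; lane < VF; ++lane)
      acc[lane] += a[i + lane];
  int s = acc[0] + acc[1] + acc[2] + acc[3];   /* epilogue lane reduction */
  for (; i < n; ++i)                           /* scalar tail */
    s += a[i];
  return s;
}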
*/ + tree vec_initial_def; + auto_vec vec_initial_defs; + if (slp_node) + { + vec_initial_defs.reserve (vec_num); + gcc_assert (slp_node == slp_node_instance->reduc_phis); + stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); + tree neutral_op + = neutral_op_for_slp_reduction (slp_node, vectype_out, + STMT_VINFO_REDUC_CODE (reduc_info), + first != NULL); + get_initial_defs_for_reduction (slp_node_instance->reduc_phis, + &vec_initial_defs, vec_num, + first != NULL, neutral_op); + } + else + { + /* Get at the scalar def before the loop, that defines the initial + value of the reduction variable. */ + tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi, + loop_preheader_edge (loop)); + /* Optimize: if initial_def is for REDUC_MAX smaller than the base + and we can't use zero for induc_val, use initial_def. Similarly + for REDUC_MIN and initial_def larger than the base. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) + { + tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); + if (TREE_CODE (initial_def) == INTEGER_CST + && !integer_zerop (induc_val) + && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR + && tree_int_cst_lt (initial_def, induc_val)) + || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR + && tree_int_cst_lt (induc_val, initial_def)))) + { + induc_val = initial_def; + /* Communicate we used the initial_def to epilouge + generation. */ + STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; + } + vec_initial_def = build_vector_from_val (vectype_out, induc_val); + } + else if (nested_cycle) + { + /* Do not use an adjustment def as that case is not supported + correctly if ncopies is not one. */ + vec_initial_def = vect_get_vec_def_for_operand (initial_def, + reduc_stmt_info); + } + else + { + tree adjustment_def = NULL_TREE; + tree *adjustment_defp = &adjustment_def; + enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) + adjustment_defp = NULL; + vec_initial_def + = get_initial_def_for_reduction (reduc_stmt_info, code, + initial_def, adjustment_defp); + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def; + } + vec_initial_defs.create (1); + vec_initial_defs.quick_push (vec_initial_def); + } + + /* Generate the reduction PHIs upfront. */ + prev_phi_info = NULL; + for (i = 0; i < vec_num; i++) + { + tree vec_init_def = vec_initial_defs[i]; + for (j = 0; j < ncopies; j++) + { + /* Create the reduction-phi that defines the reduction + operand. */ + gphi *new_phi = create_phi_node (vec_dest, loop->header); + stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); + + /* Set the loop-entry arg of the reduction-phi. */ + if (j != 0 && nested_cycle) + vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo, + vec_init_def); + add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop), + UNKNOWN_LOCATION); + + /* The loop-latch arg is set in epilogue processing. */ + + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); + else + { + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info; + else + STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; + prev_phi_info = new_phi_info; + } + } + } + + return true; +} + +/* Vectorizes LC PHIs. 
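For orientation (an illustration, not part of the patch): a loop-closed PHI has exactly one argument and only carries a value defined inside a loop out through its exit block, which is why the new vectorizable_lc_phi below merely creates one vector PHI per copy with a single incoming edge. The scalar situation that gives rise to one looks like this:

/* Illustrative only: s is defined in the inner loop and used after it,
   so in loop-closed SSA the use reaches it through a one-argument PHI
   in the inner loop's exit block.  */
double outer_sums (const double *a, int n, int m)
{
  double total = 0.0;
  for (int i = 0; i < n; ++i)
    {
      double s = 0.0;
      for (int j = 0; j < m; ++j)
        s += a[i * m + j];
      total += s;        /* use of s outside the inner loop */
    }
  return total;
}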
*/ + +bool +vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt, + slp_tree slp_node) +{ + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + if (!loop_vinfo + || !is_a (stmt_info->stmt) + || gimple_phi_num_args (stmt_info->stmt) != 1) + return false; + + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def) + return false; + + if (!vec_stmt) /* transformation not required. */ + { + STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type; + return true; + } - vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi, - epilog_copies, reduc_fn, phis, - double_reduc, slp_node, slp_node_instance, - cond_reduc_val, cond_reduc_op_code, - neutral_op); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree scalar_dest = gimple_phi_result (stmt_info->stmt); + basic_block bb = gimple_bb (stmt_info->stmt); + edge e = single_pred_edge (bb); + tree vec_dest = vect_create_destination_var (scalar_dest, vectype); + vec vec_oprnds = vNULL; + vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE, + stmt_info, &vec_oprnds, NULL, slp_node); + if (slp_node) + { + unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + gcc_assert (vec_oprnds.length () == vec_num); + for (unsigned i = 0; i < vec_num; i++) + { + /* Create the vectorized LC PHI node. */ + gphi *new_phi = create_phi_node (vec_dest, bb); + add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION); + stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info); + } + } + else + { + unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype); + stmt_vec_info prev_phi_info = NULL; + for (unsigned i = 0; i < ncopies; i++) + { + if (i != 0) + vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL); + /* Create the vectorized LC PHI node. */ + gphi *new_phi = create_phi_node (vec_dest, bb); + add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION); + stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi); + if (i == 0) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info; + else + STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info; + prev_phi_info = new_phi_info; + } + } + vec_oprnds.release (); return true; } + /* Function vect_min_worthwhile_factor. For a loop where we could vectorize the operation indicated by CODE, @@ -7789,8 +7680,8 @@ vectorizable_induction (stmt_vec_info stmt_info, bool vectorizable_live_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, - slp_tree slp_node, int slp_index, - stmt_vec_info *vec_stmt, + slp_tree slp_node, slp_instance slp_node_instance, + int slp_index, stmt_vec_info *vec_stmt, stmt_vector_for_cost *) { loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); @@ -7807,8 +7698,33 @@ vectorizable_live_operation (stmt_vec_info stmt_info, gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) - return false; + /* If a stmt of a reduction is live, vectorize it via + vect_create_epilog_for_reduction. vectorizable_reduction assessed + validity so just trigger the transform here. */ + if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))) + { + if (!vec_stmt) + return true; + if (slp_node) + { + /* For reduction chains the meta-info is attached to + the group leader. 
*/ + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); + /* For SLP reductions we vectorize the epilogue for + all involved stmts together. */ + else if (slp_index != 0) + return true; + } + stmt_vec_info reduc_info = info_for_reduction (stmt_info); + gcc_assert (reduc_info->is_reduc_info); + if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION + || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION) + return true; + vect_create_epilog_for_reduction (stmt_info, slp_node, + slp_node_instance); + return true; + } /* FORNOW. CHECKME. */ if (nested_in_vect_loop_p (loop, stmt_info)) @@ -7892,7 +7808,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info, gcc_assert (ncopies == 1 && !slp_node); vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype); + 1, vectype, NULL); } } return true; @@ -8071,31 +7987,34 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo) return false; } -/* Return a mask type with half the number of elements as TYPE. */ +/* Return a mask type with half the number of elements as OLD_TYPE, + given that it should have mode NEW_MODE. */ tree -vect_halve_mask_nunits (tree type) +vect_halve_mask_nunits (tree old_type, machine_mode new_mode) { - poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); - return build_truth_vector_type (nunits, current_vector_size); + poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2); + return build_truth_vector_type_for_mode (nunits, new_mode); } -/* Return a mask type with twice as many elements as TYPE. */ +/* Return a mask type with twice as many elements as OLD_TYPE, + given that it should have mode NEW_MODE. */ tree -vect_double_mask_nunits (tree type) +vect_double_mask_nunits (tree old_type, machine_mode new_mode) { - poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; - return build_truth_vector_type (nunits, current_vector_size); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2; + return build_truth_vector_type_for_mode (nunits, new_mode); } /* Record that a fully-masked version of LOOP_VINFO would need MASKS to contain a sequence of NVECTORS masks that each control a vector of type - VECTYPE. */ + VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND + these vector masks with the vector version of SCALAR_MASK. 
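To make the intent concrete (a sketch under an assumed VF of 4, not code from the patch): in a fully-masked loop every vector statement executes under a per-lane mask derived from the remaining iteration count, so no scalar tail loop is needed; vect_record_loop_mask below records how many such masks of which type the loop will need, and the new SCALAR_MASK argument additionally remembers the scalar condition whose vector form those loop masks will be ANDed with.

/* Scalar model of a fully-masked loop; VF is an assumed value.  */
void masked_add (int *restrict out, const int *restrict a,
                 const int *restrict b, int n)
{
  enum { VF = 4 };
  for (int i = 0; i < n; i += VF)
    for (int lane = 0; lane < VF; ++lane)
      if (i + lane < n)                 /* the per-lane loop mask */
        out[i + lane] = a[i + lane] + b[i + lane];
}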
*/ void vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, - unsigned int nvectors, tree vectype) + unsigned int nvectors, tree vectype, tree scalar_mask) { gcc_assert (nvectors != 0); if (masks->length () < nvectors) @@ -8106,10 +8025,17 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, unsigned int nscalars_per_iter = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); + + if (scalar_mask) + { + scalar_cond_masked_key cond (scalar_mask, nvectors); + loop_vinfo->scalar_cond_masked_set.add (cond); + } + if (rgm->max_nscalars_per_iter < nscalars_per_iter) { rgm->max_nscalars_per_iter = nscalars_per_iter; - rgm->mask_type = build_same_sized_truth_vector_type (vectype); + rgm->mask_type = truth_type_for (vectype); } } @@ -8154,7 +8080,7 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), TYPE_VECTOR_SUBPARTS (vectype))); gimple_seq seq = NULL; - mask_type = build_same_sized_truth_vector_type (vectype); + mask_type = truth_type_for (vectype); mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); if (seq) gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); @@ -8242,6 +8168,186 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, *seen_store = stmt_info; } +/* Helper function to pass to simplify_replace_tree to enable replacing tree's + in the hash_map with its corresponding values. */ + +static tree +find_in_mapping (tree t, void *context) +{ + hash_map* mapping = (hash_map*) context; + + tree *value = mapping->get (t); + return value ? *value : t; +} + +/* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the + original loop that has now been vectorized. + + The inits of the data_references need to be advanced with the number of + iterations of the main loop. This has been computed in vect_do_peeling and + is stored in parameter ADVANCE. We first restore the data_references + initial offset with the values recored in ORIG_DRS_INIT. + + Since the loop_vec_info of this EPILOGUE was constructed for the original + loop, its stmt_vec_infos all point to the original statements. These need + to be updated to point to their corresponding copies as well as the SSA_NAMES + in their PATTERN_DEF_SEQs and RELATED_STMTs. + + The data_reference's connections also need to be updated. Their + corresponding dr_vec_info need to be reconnected to the EPILOGUE's + stmt_vec_infos, their statements need to point to their corresponding copy, + if they are gather loads or scatter stores then their reference needs to be + updated to point to its corresponding copy and finally we set + 'base_misaligned' to false as we have already peeled for alignment in the + prologue of the main loop. */ + +static void +update_epilogue_loop_vinfo (class loop *epilogue, tree advance, + drs_init_vec &orig_drs_init) +{ + loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue); + auto_vec stmt_worklist; + hash_map mapping; + gimple *orig_stmt, *new_stmt; + gimple_stmt_iterator epilogue_gsi; + gphi_iterator epilogue_phi_gsi; + stmt_vec_info stmt_vinfo = NULL, related_vinfo; + basic_block *epilogue_bbs = get_loop_body (epilogue); + + LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; + + /* Restore original data_reference's offset, before the previous loop and its + prologue. 
*/ + std::pair *dr_init; + unsigned i; + for (i = 0; orig_drs_init.iterate (i, &dr_init); i++) + DR_OFFSET (dr_init->first) = dr_init->second; + + /* Advance data_reference's with the number of iterations of the previous + loop and its prologue. */ + vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); + + + /* The EPILOGUE loop is a copy of the original loop so they share the same + gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to + point to the copied statements. We also create a mapping of all LHS' in + the original loop and all the LHS' in the EPILOGUE and create worklists to + update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */ + for (unsigned i = 0; i < epilogue->num_nodes; ++i) + { + for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); + !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi)) + { + new_stmt = epilogue_phi_gsi.phi (); + + gcc_assert (gimple_uid (new_stmt) > 0); + stmt_vinfo + = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; + + orig_stmt = STMT_VINFO_STMT (stmt_vinfo); + STMT_VINFO_STMT (stmt_vinfo) = new_stmt; + + mapping.put (gimple_phi_result (orig_stmt), + gimple_phi_result (new_stmt)); + /* PHI nodes can not have patterns or related statements. */ + gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL + && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); + } + + for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]); + !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi)) + { + new_stmt = gsi_stmt (epilogue_gsi); + + gcc_assert (gimple_uid (new_stmt) > 0); + stmt_vinfo + = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; + + orig_stmt = STMT_VINFO_STMT (stmt_vinfo); + STMT_VINFO_STMT (stmt_vinfo) = new_stmt; + + if (tree old_lhs = gimple_get_lhs (orig_stmt)) + mapping.put (old_lhs, gimple_get_lhs (new_stmt)); + + if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) + { + gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); + for (gimple_stmt_iterator gsi = gsi_start (seq); + !gsi_end_p (gsi); gsi_next (&gsi)) + stmt_worklist.safe_push (gsi_stmt (gsi)); + } + + related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); + if (related_vinfo != NULL && related_vinfo != stmt_vinfo) + { + gimple *stmt = STMT_VINFO_STMT (related_vinfo); + stmt_worklist.safe_push (stmt); + /* Set BB such that the assert in + 'get_initial_def_for_reduction' is able to determine that + the BB of the related stmt is inside this loop. */ + gimple_set_bb (stmt, + gimple_bb (new_stmt)); + related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); + gcc_assert (related_vinfo == NULL + || related_vinfo == stmt_vinfo); + } + } + } + + /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed + using the original main loop and thus need to be updated to refer to the + cloned variables used in the epilogue. 
*/ + for (unsigned i = 0; i < stmt_worklist.length (); ++i) + { + gimple *stmt = stmt_worklist[i]; + tree *new_op; + + for (unsigned j = 1; j < gimple_num_ops (stmt); ++j) + { + tree op = gimple_op (stmt, j); + if ((new_op = mapping.get(op))) + gimple_set_op (stmt, j, *new_op); + else + { + op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, + &find_in_mapping, &mapping); + gimple_set_op (stmt, j, op); + } + } + } + + struct data_reference *dr; + vec datarefs = epilogue_vinfo->shared->datarefs; + FOR_EACH_VEC_ELT (datarefs, i, dr) + { + orig_stmt = DR_STMT (dr); + gcc_assert (gimple_uid (orig_stmt) > 0); + stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1]; + /* Data references for gather loads and scatter stores do not use the + updated offset we set using ADVANCE. Instead we have to make sure the + reference in the data references point to the corresponding copy of + the original in the epilogue. */ + if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)) + { + DR_REF (dr) + = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, + &find_in_mapping, &mapping); + DR_BASE_ADDRESS (dr) + = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, + &find_in_mapping, &mapping); + } + DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); + stmt_vinfo->dr_aux.stmt = stmt_vinfo; + /* The vector size of the epilogue is smaller than that of the main loop + so the alignment is either the same or lower. This means the dr will + thus by definition be aligned. */ + STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; + } + + epilogue_vinfo->shared->datarefs_copy.release (); + epilogue_vinfo->shared->save_datarefs (); +} + /* Function vect_transform_loop. The analysis phase has determined that the loop is vectorizable. @@ -8279,11 +8385,11 @@ vect_transform_loop (loop_vec_info loop_vinfo) if (th >= vect_vf_for_cost (loop_vinfo) && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Profitability threshold is %d loop iterations.\n", - th); - check_profitability = true; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Profitability threshold is %d loop iterations.\n", + th); + check_profitability = true; } /* Make sure there exists a single-predecessor exit bb. 
Do this before @@ -8301,18 +8407,8 @@ vect_transform_loop (loop_vec_info loop_vinfo) if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) { - poly_uint64 versioning_threshold - = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); - if (check_profitability - && ordered_p (poly_uint64 (th), versioning_threshold)) - { - versioning_threshold = ordered_max (poly_uint64 (th), - versioning_threshold); - check_profitability = false; - } struct loop *sloop - = vect_loop_versioning (loop_vinfo, th, check_profitability, - versioning_threshold); + = vect_loop_versioning (loop_vinfo); sloop->force_vectorize = false; check_profitability = false; } @@ -8337,9 +8433,13 @@ vect_transform_loop (loop_vec_info loop_vinfo) LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); + tree advance; + drs_init_vec orig_drs_init; + epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, &step_vector, &niters_vector_mult_vf, th, - check_profitability, niters_no_overflow); + check_profitability, niters_no_overflow, + &advance, orig_drs_init); if (niters_vector == NULL_TREE) { @@ -8413,7 +8513,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def - || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) && ! PURE_SLP_STMT (stmt_info)) { if (dump_enabled_p ()) @@ -8565,12 +8667,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) dump_printf (MSG_NOTE, "\n"); } else - { - dump_printf_loc (MSG_NOTE, vect_location, - "LOOP EPILOGUE VECTORIZED (VS="); - dump_dec (MSG_NOTE, current_vector_size); - dump_printf (MSG_NOTE, ")\n"); - } + dump_printf_loc (MSG_NOTE, vect_location, + "LOOP EPILOGUE VECTORIZED (MODE=%s)\n", + GET_MODE_NAME (loop_vinfo->vector_mode)); } /* Loops vectorized with a variable factor won't benefit from @@ -8592,57 +8691,14 @@ vect_transform_loop (loop_vec_info loop_vinfo) since vectorized loop can have loop-carried dependencies. */ loop->safelen = 0; - /* Don't vectorize epilogue for epilogue. */ - if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) - epilogue = NULL; - - if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)) - epilogue = NULL; - if (epilogue) { - auto_vector_sizes vector_sizes; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); - unsigned int next_size = 0; - - /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work - on niters already ajusted for the iterations of the prologue. 
*/ - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && known_eq (vf, lowest_vf)) - { - unsigned HOST_WIDE_INT eiters - = (LOOP_VINFO_INT_NITERS (loop_vinfo) - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); - eiters - = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); - epilogue->nb_iterations_upper_bound = eiters - 1; - epilogue->any_upper_bound = true; - - unsigned int ratio; - while (next_size < vector_sizes.length () - && !(constant_multiple_p (current_vector_size, - vector_sizes[next_size], &ratio) - && eiters >= lowest_vf / ratio)) - next_size += 1; - } - else - while (next_size < vector_sizes.length () - && maybe_lt (current_vector_size, vector_sizes[next_size])) - next_size += 1; - - if (next_size == vector_sizes.length ()) - epilogue = NULL; - } + update_epilogue_loop_vinfo (epilogue, advance, orig_drs_init); - if (epilogue) - { + epilogue->simduid = loop->simduid; epilogue->force_vectorize = loop->force_vectorize; epilogue->safelen = loop->safelen; epilogue->dont_vectorize = false; - - /* We may need to if-convert epilogue to vectorize it. */ - if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) - tree_if_conversion (epilogue); } return epilogue; diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index badf4e7104e..6356ecd692f 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -46,6 +46,8 @@ along with GCC; see the file COPYING3. If not see #include "cgraph.h" #include "omp-simd-clone.h" #include "predict.h" +#include "tree-vector-builder.h" +#include "vec-perm-indices.h" /* Return true if we have a useful VR_RANGE range for VAR, storing it in *MIN_VALUE and *MAX_VALUE if so. Note the range in the dump files. */ @@ -185,15 +187,15 @@ vect_get_external_def_edge (vec_info *vinfo, tree var) is nonnull. */ static bool -vect_supportable_direct_optab_p (tree otype, tree_code code, +vect_supportable_direct_optab_p (vec_info *vinfo, tree otype, tree_code code, tree itype, tree *vecotype_out, tree *vecitype_out = NULL) { - tree vecitype = get_vectype_for_scalar_type (itype); + tree vecitype = get_vectype_for_scalar_type (vinfo, itype); if (!vecitype) return false; - tree vecotype = get_vectype_for_scalar_type (otype); + tree vecotype = get_vectype_for_scalar_type (vinfo, otype); if (!vecotype) return false; @@ -632,6 +634,7 @@ static bool vect_split_statement (stmt_vec_info stmt2_info, tree new_rhs, gimple *stmt1, tree vectype) { + vec_info *vinfo = stmt2_info->vinfo; if (is_pattern_stmt_p (stmt2_info)) { /* STMT2_INFO is part of a pattern. Get the statement to which @@ -675,7 +678,7 @@ vect_split_statement (stmt_vec_info stmt2_info, tree new_rhs, two-statement pattern now. */ gcc_assert (!STMT_VINFO_RELATED_STMT (stmt2_info)); tree lhs_type = TREE_TYPE (gimple_get_lhs (stmt2_info->stmt)); - tree lhs_vectype = get_vectype_for_scalar_type (lhs_type); + tree lhs_vectype = get_vectype_for_scalar_type (vinfo, lhs_type); if (!lhs_vectype) return false; @@ -712,6 +715,8 @@ static tree vect_convert_input (stmt_vec_info stmt_info, tree type, vect_unpromoted_value *unprom, tree vectype) { + vec_info *vinfo = stmt_info->vinfo; + /* Check for a no-op conversion. */ if (types_compatible_p (type, TREE_TYPE (unprom->op))) return unprom->op; @@ -749,7 +754,7 @@ vect_convert_input (stmt_vec_info stmt_info, tree type, unsigned promotion. 
*/ tree midtype = build_nonstandard_integer_type (TYPE_PRECISION (type), TYPE_UNSIGNED (unprom->type)); - tree vec_midtype = get_vectype_for_scalar_type (midtype); + tree vec_midtype = get_vectype_for_scalar_type (vinfo, midtype); if (vec_midtype) { input = vect_recog_temp_ssa_var (midtype, NULL); @@ -830,17 +835,8 @@ vect_convert_output (stmt_vec_info stmt_info, tree type, gimple *pattern_stmt, /* Return true if STMT_VINFO describes a reduction for which reassociation is allowed. If STMT_INFO is part of a group, assume that it's part of a reduction chain and optimistically assume that all statements - except the last allow reassociation. */ - -static bool -vect_reassociating_reduction_p (stmt_vec_info stmt_vinfo) -{ - return (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def - ? STMT_VINFO_REDUC_TYPE (stmt_vinfo) != FOLD_LEFT_REDUCTION - : REDUC_GROUP_FIRST_ELEMENT (stmt_vinfo) != NULL); -} - -/* As above, but also require it to have code CODE and to be a reduction + except the last allow reassociation. + Also require it to have code CODE and to be a reduction in the outermost loop. When returning true, store the operands in *OP0_OUT and *OP1_OUT. */ @@ -862,11 +858,19 @@ vect_reassociating_reduction_p (stmt_vec_info stmt_info, tree_code code, if (loop && nested_in_vect_loop_p (loop, stmt_info)) return false; - if (!vect_reassociating_reduction_p (stmt_info)) + if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) + { + if (needs_fold_left_reduction_p (TREE_TYPE (gimple_assign_lhs (assign)), + code)) + return false; + } + else if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == NULL) return false; *op0_out = gimple_assign_rhs1 (assign); *op1_out = gimple_assign_rhs2 (assign); + if (commutative_tree_code (code) && STMT_VINFO_REDUC_IDX (stmt_info) == 0) + std::swap (*op0_out, *op1_out); return true; } @@ -983,7 +987,7 @@ vect_recog_dot_prod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) vect_pattern_detected ("vect_recog_dot_prod_pattern", last_stmt); tree half_vectype; - if (!vect_supportable_direct_optab_p (type, DOT_PROD_EXPR, half_type, + if (!vect_supportable_direct_optab_p (vinfo, type, DOT_PROD_EXPR, half_type, type_out, &half_vectype)) return NULL; @@ -1141,7 +1145,7 @@ vect_recog_sad_pattern (stmt_vec_info stmt_vinfo, tree *type_out) vect_pattern_detected ("vect_recog_sad_pattern", last_stmt); tree half_vectype; - if (!vect_supportable_direct_optab_p (sum_type, SAD_EXPR, half_type, + if (!vect_supportable_direct_optab_p (vinfo, sum_type, SAD_EXPR, half_type, type_out, &half_vectype)) return NULL; @@ -1187,6 +1191,7 @@ vect_recog_widen_op_pattern (stmt_vec_info last_stmt_info, tree *type_out, tree_code orig_code, tree_code wide_code, bool shift_p, const char *name) { + vec_info *vinfo = last_stmt_info->vinfo; gimple *last_stmt = last_stmt_info->stmt; vect_unpromoted_value unprom[2]; @@ -1206,8 +1211,8 @@ vect_recog_widen_op_pattern (stmt_vec_info last_stmt_info, tree *type_out, TYPE_UNSIGNED (half_type)); /* Check target support */ - tree vectype = get_vectype_for_scalar_type (half_type); - tree vecitype = get_vectype_for_scalar_type (itype); + tree vectype = get_vectype_for_scalar_type (vinfo, half_type); + tree vecitype = get_vectype_for_scalar_type (vinfo, itype); enum tree_code dummy_code; int dummy_int; auto_vec dummy_vec; @@ -1219,7 +1224,7 @@ vect_recog_widen_op_pattern (stmt_vec_info last_stmt_info, tree *type_out, &dummy_int, &dummy_vec)) return NULL; - *type_out = get_vectype_for_scalar_type (type); + *type_out = get_vectype_for_scalar_type (vinfo, type); if 
(!*type_out) return NULL; @@ -1271,6 +1276,7 @@ vect_recog_widen_mult_pattern (stmt_vec_info last_stmt_info, tree *type_out) static gimple * vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { + vec_info *vinfo = stmt_vinfo->vinfo; gimple *last_stmt = stmt_vinfo->stmt; tree base, exp; gimple *stmt; @@ -1339,7 +1345,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) if (node->simd_clones == NULL) return NULL; } - *type_out = get_vectype_for_scalar_type (TREE_TYPE (base)); + *type_out = get_vectype_for_scalar_type (vinfo, TREE_TYPE (base)); if (!*type_out) return NULL; tree def = vect_recog_temp_ssa_var (TREE_TYPE (base), NULL); @@ -1364,7 +1370,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) || (TREE_CODE (exp) == REAL_CST && real_equal (&TREE_REAL_CST (exp), &dconst2))) { - if (!vect_supportable_direct_optab_p (TREE_TYPE (base), MULT_EXPR, + if (!vect_supportable_direct_optab_p (vinfo, TREE_TYPE (base), MULT_EXPR, TREE_TYPE (base), type_out)) return NULL; @@ -1377,7 +1383,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) if (TREE_CODE (exp) == REAL_CST && real_equal (&TREE_REAL_CST (exp), &dconsthalf)) { - *type_out = get_vectype_for_scalar_type (TREE_TYPE (base)); + *type_out = get_vectype_for_scalar_type (vinfo, TREE_TYPE (base)); if (*type_out && direct_internal_fn_supported_p (IFN_SQRT, *type_out, OPTIMIZE_FOR_SPEED)) @@ -1470,8 +1476,8 @@ vect_recog_widen_sum_pattern (stmt_vec_info stmt_vinfo, tree *type_out) vect_pattern_detected ("vect_recog_widen_sum_pattern", last_stmt); - if (!vect_supportable_direct_optab_p (type, WIDEN_SUM_EXPR, unprom0.type, - type_out)) + if (!vect_supportable_direct_optab_p (vinfo, type, WIDEN_SUM_EXPR, + unprom0.type, type_out)) return NULL; var = vect_recog_temp_ssa_var (type, NULL); @@ -1662,7 +1668,7 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) vect_pattern_detected ("vect_recog_over_widening_pattern", last_stmt); - *type_out = get_vectype_for_scalar_type (type); + *type_out = get_vectype_for_scalar_type (vinfo, type); if (!*type_out) return NULL; @@ -1683,8 +1689,8 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) wants to rewrite anyway. If targets have a minimum element size for some optabs, we should pattern-match smaller ops to larger ops where beneficial. */ - tree new_vectype = get_vectype_for_scalar_type (new_type); - tree op_vectype = get_vectype_for_scalar_type (op_type); + tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type); + tree op_vectype = get_vectype_for_scalar_type (vinfo, op_type); if (!new_vectype || !op_vectype) return NULL; @@ -1842,7 +1848,7 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out) TYPE_UNSIGNED (new_type)); /* Check for target support. */ - tree new_vectype = get_vectype_for_scalar_type (new_type); + tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type); if (!new_vectype || !direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED)) @@ -1850,7 +1856,7 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out) /* The IR requires a valid vector type for the cast result, even though it's likely to be discarded. */ - *type_out = get_vectype_for_scalar_type (type); + *type_out = get_vectype_for_scalar_type (vinfo, type); if (!*type_out) return NULL; @@ -1936,7 +1942,7 @@ vect_recog_cast_forwprop_pattern (stmt_vec_info last_stmt_info, tree *type_out) the unnecessary widening and narrowing. 
*/ vect_pattern_detected ("vect_recog_cast_forwprop_pattern", last_stmt); - *type_out = get_vectype_for_scalar_type (lhs_type); + *type_out = get_vectype_for_scalar_type (vinfo, lhs_type); if (!*type_out) return NULL; @@ -1996,24 +2002,107 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) enum vect_def_type dt; optab optab1, optab2; edge ext_def = NULL; + bool bswap16_p = false; - if (!is_gimple_assign (last_stmt)) - return NULL; + if (is_gimple_assign (last_stmt)) + { + rhs_code = gimple_assign_rhs_code (last_stmt); + switch (rhs_code) + { + case LROTATE_EXPR: + case RROTATE_EXPR: + break; + default: + return NULL; + } - rhs_code = gimple_assign_rhs_code (last_stmt); - switch (rhs_code) + lhs = gimple_assign_lhs (last_stmt); + oprnd0 = gimple_assign_rhs1 (last_stmt); + type = TREE_TYPE (oprnd0); + oprnd1 = gimple_assign_rhs2 (last_stmt); + } + else if (gimple_call_builtin_p (last_stmt, BUILT_IN_BSWAP16)) { - case LROTATE_EXPR: - case RROTATE_EXPR: - break; - default: - return NULL; + /* __builtin_bswap16 (x) is another form of x r>> 8. + The vectorizer has bswap support, but only if the argument isn't + promoted. */ + lhs = gimple_call_lhs (last_stmt); + oprnd0 = gimple_call_arg (last_stmt, 0); + type = TREE_TYPE (oprnd0); + if (TYPE_PRECISION (TREE_TYPE (lhs)) != 16 + || TYPE_PRECISION (type) <= 16 + || TREE_CODE (oprnd0) != SSA_NAME + || BITS_PER_UNIT != 8 + || !TYPE_UNSIGNED (TREE_TYPE (lhs))) + return NULL; + + stmt_vec_info def_stmt_info; + if (!vect_is_simple_use (oprnd0, vinfo, &dt, &def_stmt_info, &def_stmt)) + return NULL; + + if (dt != vect_internal_def) + return NULL; + + if (gimple_assign_cast_p (def_stmt)) + { + def = gimple_assign_rhs1 (def_stmt); + if (INTEGRAL_TYPE_P (TREE_TYPE (def)) + && TYPE_PRECISION (TREE_TYPE (def)) == 16) + oprnd0 = def; + } + + type = TREE_TYPE (lhs); + vectype = get_vectype_for_scalar_type (vinfo, type); + if (vectype == NULL_TREE) + return NULL; + + if (tree char_vectype = get_same_sized_vectype (char_type_node, vectype)) + { + /* The encoding uses one stepped pattern for each byte in the + 16-bit word. */ + vec_perm_builder elts (TYPE_VECTOR_SUBPARTS (char_vectype), 2, 3); + for (unsigned i = 0; i < 3; ++i) + for (unsigned j = 0; j < 2; ++j) + elts.quick_push ((i + 1) * 2 - j - 1); + + vec_perm_indices indices (elts, 1, + TYPE_VECTOR_SUBPARTS (char_vectype)); + if (can_vec_perm_const_p (TYPE_MODE (char_vectype), indices)) + { + /* vectorizable_bswap can handle the __builtin_bswap16 if we + undo the argument promotion. */ + if (!useless_type_conversion_p (type, TREE_TYPE (oprnd0))) + { + def = vect_recog_temp_ssa_var (type, NULL); + def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); + append_pattern_def_seq (stmt_vinfo, def_stmt); + oprnd0 = def; + } + + /* Pattern detected. */ + vect_pattern_detected ("vect_recog_rotate_pattern", last_stmt); + + *type_out = vectype; + + /* Pattern supported. Create a stmt to be used to replace the + pattern, with the unpromoted argument. 
*/ + var = vect_recog_temp_ssa_var (type, NULL); + pattern_stmt = gimple_build_call (gimple_call_fndecl (last_stmt), + 1, oprnd0); + gimple_call_set_lhs (pattern_stmt, var); + gimple_call_set_fntype (as_a (pattern_stmt), + gimple_call_fntype (last_stmt)); + return pattern_stmt; + } + } + + oprnd1 = build_int_cst (integer_type_node, 8); + rhs_code = LROTATE_EXPR; + bswap16_p = true; } + else + return NULL; - lhs = gimple_assign_lhs (last_stmt); - oprnd0 = gimple_assign_rhs1 (last_stmt); - type = TREE_TYPE (oprnd0); - oprnd1 = gimple_assign_rhs2 (last_stmt); if (TREE_CODE (oprnd0) != SSA_NAME || TYPE_PRECISION (TREE_TYPE (lhs)) != TYPE_PRECISION (type) || !INTEGRAL_TYPE_P (type) @@ -2029,7 +2118,7 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) && dt != vect_external_def) return NULL; - vectype = get_vectype_for_scalar_type (type); + vectype = get_vectype_for_scalar_type (vinfo, type); if (vectype == NULL_TREE) return NULL; @@ -2038,14 +2127,39 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) optab1 = optab_for_tree_code (rhs_code, vectype, optab_vector); if (optab1 && optab_handler (optab1, TYPE_MODE (vectype)) != CODE_FOR_nothing) - return NULL; + { + use_rotate: + if (bswap16_p) + { + if (!useless_type_conversion_p (type, TREE_TYPE (oprnd0))) + { + def = vect_recog_temp_ssa_var (type, NULL); + def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); + append_pattern_def_seq (stmt_vinfo, def_stmt); + oprnd0 = def; + } + + /* Pattern detected. */ + vect_pattern_detected ("vect_recog_rotate_pattern", last_stmt); + + *type_out = vectype; + + /* Pattern supported. Create a stmt to be used to replace the + pattern. */ + var = vect_recog_temp_ssa_var (type, NULL); + pattern_stmt = gimple_build_assign (var, LROTATE_EXPR, oprnd0, + oprnd1); + return pattern_stmt; + } + return NULL; + } if (is_a (vinfo) || dt != vect_internal_def) { optab2 = optab_for_tree_code (rhs_code, vectype, optab_scalar); if (optab2 && optab_handler (optab2, TYPE_MODE (vectype)) != CODE_FOR_nothing) - return NULL; + goto use_rotate; } /* If vector/vector or vector/scalar shifts aren't supported by the target, @@ -2070,6 +2184,14 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) *type_out = vectype; + if (bswap16_p && !useless_type_conversion_p (type, TREE_TYPE (oprnd0))) + { + def = vect_recog_temp_ssa_var (type, NULL); + def_stmt = gimple_build_assign (def, NOP_EXPR, oprnd0); + append_pattern_def_seq (stmt_vinfo, def_stmt); + oprnd0 = def; + } + if (dt == vect_external_def && TREE_CODE (oprnd1) == SSA_NAME) ext_def = vect_get_external_def_edge (vinfo, oprnd1); @@ -2106,7 +2228,7 @@ vect_recog_rotate_pattern (stmt_vec_info stmt_vinfo, tree *type_out) } else { - tree vecstype = get_vectype_for_scalar_type (stype); + tree vecstype = get_vectype_for_scalar_type (vinfo, stype); if (vecstype == NULL_TREE) return NULL; @@ -2235,7 +2357,7 @@ vect_recog_vector_vector_shift_pattern (stmt_vec_info stmt_vinfo, if (!def_vinfo) return NULL; - *type_out = get_vectype_for_scalar_type (TREE_TYPE (oprnd0)); + *type_out = get_vectype_for_scalar_type (vinfo, TREE_TYPE (oprnd0)); if (*type_out == NULL_TREE) return NULL; @@ -2258,7 +2380,8 @@ vect_recog_vector_vector_shift_pattern (stmt_vec_info stmt_vinfo, TYPE_PRECISION (TREE_TYPE (oprnd1))); def = vect_recog_temp_ssa_var (TREE_TYPE (rhs1), NULL); def_stmt = gimple_build_assign (def, BIT_AND_EXPR, rhs1, mask); - tree vecstype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); + tree vecstype = get_vectype_for_scalar_type 
(vinfo, + TREE_TYPE (rhs1)); append_pattern_def_seq (stmt_vinfo, def_stmt, vecstype); } } @@ -2423,6 +2546,7 @@ static gimple * vect_synth_mult_by_constant (tree op, tree val, stmt_vec_info stmt_vinfo) { + vec_info *vinfo = stmt_vinfo->vinfo; tree itype = TREE_TYPE (op); machine_mode mode = TYPE_MODE (itype); struct algorithm alg; @@ -2441,7 +2565,7 @@ vect_synth_mult_by_constant (tree op, tree val, /* Targets that don't support vector shifts but support vector additions can synthesize shifts that way. */ - bool synth_shift_p = !vect_supportable_shift (LSHIFT_EXPR, multtype); + bool synth_shift_p = !vect_supportable_shift (vinfo, LSHIFT_EXPR, multtype); HOST_WIDE_INT hwval = tree_to_shwi (val); /* Use MAX_COST here as we don't want to limit the sequence on rtx costs. @@ -2452,7 +2576,7 @@ vect_synth_mult_by_constant (tree op, tree val, if (!possible) return NULL; - tree vectype = get_vectype_for_scalar_type (multtype); + tree vectype = get_vectype_for_scalar_type (vinfo, multtype); if (!vectype || !target_supports_mult_synth_alg (&alg, variant, @@ -2598,6 +2722,7 @@ vect_synth_mult_by_constant (tree op, tree val, static gimple * vect_recog_mult_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { + vec_info *vinfo = stmt_vinfo->vinfo; gimple *last_stmt = stmt_vinfo->stmt; tree oprnd0, oprnd1, vectype, itype; gimple *pattern_stmt; @@ -2618,7 +2743,7 @@ vect_recog_mult_pattern (stmt_vec_info stmt_vinfo, tree *type_out) || !type_has_mode_precision_p (itype)) return NULL; - vectype = get_vectype_for_scalar_type (itype); + vectype = get_vectype_for_scalar_type (vinfo, itype); if (vectype == NULL_TREE) return NULL; @@ -2686,6 +2811,7 @@ vect_recog_mult_pattern (stmt_vec_info stmt_vinfo, tree *type_out) static gimple * vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { + vec_info *vinfo = stmt_vinfo->vinfo; gimple *last_stmt = stmt_vinfo->stmt; tree oprnd0, oprnd1, vectype, itype, cond; gimple *pattern_stmt, *def_stmt; @@ -2718,7 +2844,7 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) return NULL; scalar_int_mode itype_mode = SCALAR_INT_TYPE_MODE (itype); - vectype = get_vectype_for_scalar_type (itype); + vectype = get_vectype_for_scalar_type (vinfo, itype); if (vectype == NULL_TREE) return NULL; @@ -2785,7 +2911,7 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { tree utype = build_nonstandard_integer_type (prec, 1); - tree vecutype = get_vectype_for_scalar_type (utype); + tree vecutype = get_vectype_for_scalar_type (vinfo, utype); tree shift = build_int_cst (utype, GET_MODE_BITSIZE (itype_mode) - tree_log2 (oprnd1)); @@ -3104,6 +3230,7 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) static gimple * vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { + vec_info *vinfo = stmt_vinfo->vinfo; gimple *last_stmt = stmt_vinfo->stmt; tree cond_expr, then_clause, else_clause; tree type, vectype, comp_vectype, itype = NULL_TREE, vecitype; @@ -3126,7 +3253,7 @@ vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) return NULL; comp_scalar_type = TREE_TYPE (TREE_OPERAND (cond_expr, 0)); - comp_vectype = get_vectype_for_scalar_type (comp_scalar_type); + comp_vectype = get_vectype_for_scalar_type (vinfo, comp_scalar_type); if (comp_vectype == NULL_TREE) return NULL; @@ -3174,7 +3301,7 @@ vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) if (GET_MODE_BITSIZE (type_mode) == cmp_mode_size) return NULL; - vectype = get_vectype_for_scalar_type 
(type); + vectype = get_vectype_for_scalar_type (vinfo, type); if (vectype == NULL_TREE) return NULL; @@ -3189,7 +3316,7 @@ vect_recog_mixed_size_cond_pattern (stmt_vec_info stmt_vinfo, tree *type_out) || GET_MODE_BITSIZE (SCALAR_TYPE_MODE (itype)) != cmp_mode_size) return NULL; - vecitype = get_vectype_for_scalar_type (itype); + vecitype = get_vectype_for_scalar_type (vinfo, itype); if (vecitype == NULL_TREE) return NULL; @@ -3283,11 +3410,12 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set &stmts) if (stmt_could_throw_p (cfun, def_stmt)) return false; - comp_vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); + comp_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); if (comp_vectype == NULL_TREE) return false; - tree mask_type = get_mask_type_for_scalar_type (TREE_TYPE (rhs1)); + tree mask_type = get_mask_type_for_scalar_type (vinfo, + TREE_TYPE (rhs1)); if (mask_type && expand_vec_cmp_expr_p (comp_vectype, mask_type, rhs_code)) return false; @@ -3297,7 +3425,7 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set &stmts) scalar_mode mode = SCALAR_TYPE_MODE (TREE_TYPE (rhs1)); tree itype = build_nonstandard_integer_type (GET_MODE_BITSIZE (mode), 1); - vecitype = get_vectype_for_scalar_type (itype); + vecitype = get_vectype_for_scalar_type (vinfo, itype); if (vecitype == NULL_TREE) return false; } @@ -3326,10 +3454,11 @@ check_bool_pattern (tree var, vec_info *vinfo, hash_set &stmts) static tree adjust_bool_pattern_cast (tree type, tree var, stmt_vec_info stmt_info) { + vec_info *vinfo = stmt_info->vinfo; gimple *cast_stmt = gimple_build_assign (vect_recog_temp_ssa_var (type, NULL), NOP_EXPR, var); append_pattern_def_seq (stmt_info, cast_stmt, - get_vectype_for_scalar_type (type)); + get_vectype_for_scalar_type (vinfo, type)); return gimple_assign_lhs (cast_stmt); } @@ -3343,6 +3472,7 @@ static void adjust_bool_pattern (tree var, tree out_type, stmt_vec_info stmt_info, hash_map &defs) { + vec_info *vinfo = stmt_info->vinfo; gimple *stmt = SSA_NAME_DEF_STMT (var); enum tree_code rhs_code, def_rhs_code; tree itype, cond_expr, rhs1, rhs2, irhs1, irhs2; @@ -3504,7 +3634,7 @@ adjust_bool_pattern (tree var, tree out_type, gimple_set_location (pattern_stmt, loc); append_pattern_def_seq (stmt_info, pattern_stmt, - get_vectype_for_scalar_type (itype)); + get_vectype_for_scalar_type (vinfo, itype)); defs.put (var, gimple_assign_lhs (pattern_stmt)); } @@ -3607,14 +3737,14 @@ search_type_for_mask_1 (tree var, vec_info *vinfo, break; } - comp_vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); + comp_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); if (comp_vectype == NULL_TREE) { res = NULL_TREE; break; } - mask_type = get_mask_type_for_scalar_type (TREE_TYPE (rhs1)); + mask_type = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (rhs1)); if (!mask_type || !expand_vec_cmp_expr_p (comp_vectype, mask_type, rhs_code)) { @@ -3722,7 +3852,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) if (! 
INTEGRAL_TYPE_P (TREE_TYPE (lhs)) || TYPE_PRECISION (TREE_TYPE (lhs)) == 1) return NULL; - vectype = get_vectype_for_scalar_type (TREE_TYPE (lhs)); + vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); if (vectype == NULL_TREE) return NULL; @@ -3759,7 +3889,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) if (!useless_type_conversion_p (type, TREE_TYPE (lhs))) { - tree new_vectype = get_vectype_for_scalar_type (type); + tree new_vectype = get_vectype_for_scalar_type (vinfo, type); append_pattern_def_seq (stmt_vinfo, pattern_stmt, new_vectype); lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL); @@ -3775,7 +3905,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) else if (rhs_code == COND_EXPR && TREE_CODE (var) == SSA_NAME) { - vectype = get_vectype_for_scalar_type (TREE_TYPE (lhs)); + vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); if (vectype == NULL_TREE) return NULL; @@ -3789,7 +3919,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) tree type = build_nonstandard_integer_type (prec, TYPE_UNSIGNED (TREE_TYPE (var))); - if (get_vectype_for_scalar_type (type) == NULL_TREE) + if (get_vectype_for_scalar_type (vinfo, type) == NULL_TREE) return NULL; if (!check_bool_pattern (var, vinfo, bool_stmts)) @@ -3833,7 +3963,7 @@ vect_recog_bool_pattern (stmt_vec_info stmt_vinfo, tree *type_out) cst0 = build_int_cst (type, 0); cst1 = build_int_cst (type, 1); - new_vectype = get_vectype_for_scalar_type (type); + new_vectype = get_vectype_for_scalar_type (vinfo, type); rhs = vect_recog_temp_ssa_var (type, NULL); pattern_stmt = gimple_build_assign (rhs, COND_EXPR, var, cst1, cst0); @@ -3874,7 +4004,7 @@ build_mask_conversion (tree mask, tree vectype, stmt_vec_info stmt_vinfo) gimple *stmt; tree masktype, tmp; - masktype = build_same_sized_truth_vector_type (vectype); + masktype = truth_type_for (vectype); tmp = vect_recog_temp_ssa_var (TREE_TYPE (masktype), NULL); stmt = gimple_build_assign (tmp, CONVERT_EXPR, mask); append_pattern_def_seq (stmt_vinfo, stmt, masktype); @@ -3934,19 +4064,19 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { int rhs_index = internal_fn_stored_value_index (ifn); tree rhs = gimple_call_arg (last_stmt, rhs_index); - vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs)); + vectype1 = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs)); } else { lhs = gimple_call_lhs (last_stmt); - vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs)); + vectype1 = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); } tree mask_arg = gimple_call_arg (last_stmt, mask_argno); tree mask_arg_type = search_type_for_mask (mask_arg, vinfo); if (!mask_arg_type) return NULL; - vectype2 = get_mask_type_for_scalar_type (mask_arg_type); + vectype2 = get_mask_type_for_scalar_type (vinfo, mask_arg_type); if (!vectype1 || !vectype2 || known_eq (TYPE_VECTOR_SUBPARTS (vectype1), @@ -3992,7 +4122,7 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) /* Check for cond expression requiring mask conversion. 
*/ if (rhs_code == COND_EXPR) { - vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs)); + vectype1 = get_vectype_for_scalar_type (vinfo, TREE_TYPE (lhs)); if (TREE_CODE (rhs1) == SSA_NAME) { @@ -4023,7 +4153,7 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) else return NULL; - vectype2 = get_mask_type_for_scalar_type (rhs1_type); + vectype2 = get_mask_type_for_scalar_type (vinfo, rhs1_type); if (!vectype1 || !vectype2) return NULL; @@ -4058,7 +4188,8 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) tree wide_scalar_type = build_nonstandard_integer_type (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))), TYPE_UNSIGNED (rhs1_type)); - tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type); + tree vectype3 = get_vectype_for_scalar_type (vinfo, + wide_scalar_type); if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1))) return NULL; } @@ -4113,14 +4244,14 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out) if (TYPE_PRECISION (rhs1_type) < TYPE_PRECISION (rhs2_type)) { - vectype1 = get_mask_type_for_scalar_type (rhs1_type); + vectype1 = get_mask_type_for_scalar_type (vinfo, rhs1_type); if (!vectype1) return NULL; rhs2 = build_mask_conversion (rhs2, vectype1, stmt_vinfo); } else { - vectype1 = get_mask_type_for_scalar_type (rhs2_type); + vectype1 = get_mask_type_for_scalar_type (vinfo, rhs2_type); if (!vectype1) return NULL; rhs1 = build_mask_conversion (rhs1, vectype1, stmt_vinfo); @@ -4191,7 +4322,7 @@ vect_convert_mask_for_vectype (tree mask, tree vectype, tree mask_type = search_type_for_mask (mask, vinfo); if (mask_type) { - tree mask_vectype = get_mask_type_for_scalar_type (mask_type); + tree mask_vectype = get_mask_type_for_scalar_type (vinfo, mask_type); if (mask_vectype && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), TYPE_VECTOR_SUBPARTS (mask_vectype))) @@ -4214,10 +4345,11 @@ vect_add_conversion_to_pattern (tree type, tree value, stmt_vec_info stmt_info) if (useless_type_conversion_p (type, TREE_TYPE (value))) return value; + vec_info *vinfo = stmt_info->vinfo; tree new_value = vect_recog_temp_ssa_var (type, NULL); gassign *conversion = gimple_build_assign (new_value, CONVERT_EXPR, value); append_pattern_def_seq (stmt_info, conversion, - get_vectype_for_scalar_type (type)); + get_vectype_for_scalar_type (vinfo, type)); return new_value; } @@ -4253,7 +4385,8 @@ vect_recog_gather_scatter_pattern (stmt_vec_info stmt_info, tree *type_out) return NULL; /* Convert the mask to the right form. 
*/ - tree gs_vectype = get_vectype_for_scalar_type (gs_info.element_type); + tree gs_vectype = get_vectype_for_scalar_type (loop_vinfo, + gs_info.element_type); if (mask) mask = vect_convert_mask_for_vectype (mask, gs_vectype, stmt_info, loop_vinfo); @@ -4731,6 +4864,7 @@ static inline void vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt, tree pattern_vectype) { + stmt_vec_info orig_stmt_info_saved = orig_stmt_info; gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (orig_stmt_info); gimple *orig_pattern_stmt = NULL; @@ -4765,6 +4899,9 @@ vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt, for (gimple_stmt_iterator si = gsi_start (def_seq); !gsi_end_p (si); gsi_next (&si)) { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "extra pattern stmt: %G", gsi_stmt (si)); stmt_vec_info pattern_stmt_info = vect_init_pattern_stmt (gsi_stmt (si), orig_stmt_info, pattern_vectype); @@ -4790,6 +4927,60 @@ vect_mark_pattern_stmts (stmt_vec_info orig_stmt_info, gimple *pattern_stmt, } else vect_set_pattern_stmt (pattern_stmt, orig_stmt_info, pattern_vectype); + + /* Transfer reduction path info to the pattern. */ + if (STMT_VINFO_REDUC_IDX (orig_stmt_info_saved) != -1) + { + vec_info *vinfo = orig_stmt_info_saved->vinfo; + tree lookfor = gimple_op (orig_stmt_info_saved->stmt, + 1 + STMT_VINFO_REDUC_IDX (orig_stmt_info)); + /* Search the pattern def sequence and the main pattern stmt. Note + we may have inserted all into a containing pattern def sequence + so the following is a bit awkward. */ + gimple_stmt_iterator si; + gimple *s; + if (def_seq) + { + si = gsi_start (def_seq); + s = gsi_stmt (si); + gsi_next (&si); + } + else + { + si = gsi_none (); + s = pattern_stmt; + } + do + { + bool found = false; + for (unsigned i = 1; i < gimple_num_ops (s); ++i) + if (gimple_op (s, i) == lookfor) + { + STMT_VINFO_REDUC_IDX (vinfo->lookup_stmt (s)) = i - 1; + lookfor = gimple_get_lhs (s); + found = true; + break; + } + if (s == pattern_stmt) + { + if (!found && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "failed to update reduction index.\n"); + break; + } + if (gsi_end_p (si)) + s = pattern_stmt; + else + { + s = gsi_stmt (si); + if (s == pattern_stmt) + /* Found the end inside a bigger pattern def seq. */ + si = gsi_none (); + else + gsi_next (&si); + } + } while (1); + } } /* Function vect_pattern_recog_1 diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 2abf480270c..0bef35782b5 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -79,6 +79,7 @@ vect_free_slp_tree (slp_tree node, bool final_p) SLP_TREE_CHILDREN (node).release (); SLP_TREE_SCALAR_STMTS (node).release (); + SLP_TREE_SCALAR_OPS (node).release (); SLP_TREE_VEC_STMTS (node).release (); SLP_TREE_LOAD_PERMUTATION (node).release (); @@ -122,6 +123,7 @@ vect_create_new_slp_node (vec scalar_stmts) node = XNEW (struct _slp_tree); SLP_TREE_SCALAR_STMTS (node) = scalar_stmts; + SLP_TREE_SCALAR_OPS (node) = vNULL; SLP_TREE_VEC_STMTS (node).create (0); SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; SLP_TREE_CHILDREN (node).create (nops); @@ -138,6 +140,28 @@ vect_create_new_slp_node (vec scalar_stmts) return node; } +/* Create an SLP node for OPS. 
*/ + +static slp_tree +vect_create_new_slp_node (vec ops) +{ + slp_tree node; + + node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (node) = vNULL; + SLP_TREE_SCALAR_OPS (node) = ops; + SLP_TREE_VEC_STMTS (node).create (0); + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; + SLP_TREE_CHILDREN (node) = vNULL; + SLP_TREE_LOAD_PERMUTATION (node) = vNULL; + SLP_TREE_TWO_OPERATORS (node) = false; + SLP_TREE_DEF_TYPE (node) = vect_external_def; + node->refcnt = 1; + node->max_nunits = 1; + + return node; +} + /* This structure is used in creation of an SLP tree. Each instance corresponds to the same operand in a group of scalar stmts in an SLP @@ -146,6 +170,8 @@ typedef struct _slp_oprnd_info { /* Def-stmts for the operands. */ vec def_stmts; + /* Operands. */ + vec ops; /* Information about the first statement, its vector def-type, type, the operand itself in case it's constant, and an indication if it's a pattern stmt. */ @@ -169,6 +195,7 @@ vect_create_oprnd_info (int nops, int group_size) { oprnd_info = XNEW (struct _slp_oprnd_info); oprnd_info->def_stmts.create (group_size); + oprnd_info->ops.create (group_size); oprnd_info->first_dt = vect_uninitialized_def; oprnd_info->first_op_type = NULL_TREE; oprnd_info->any_pattern = false; @@ -190,6 +217,7 @@ vect_free_oprnd_info (vec &oprnds_info) FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) { oprnd_info->def_stmts.release (); + oprnd_info->ops.release (); XDELETE (oprnd_info); } @@ -197,6 +225,19 @@ vect_free_oprnd_info (vec &oprnds_info) } +/* Return true if STMTS contains a pattern statement. */ + +static bool +vect_contains_pattern_stmt_p (vec stmts) +{ + stmt_vec_info stmt_info; + unsigned int i; + FOR_EACH_VEC_ELT (stmts, i, stmt_info) + if (is_pattern_stmt_p (stmt_info)) + return true; + return false; +} + /* Find the place of the data-ref in STMT_INFO in the interleaving chain that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part of the chain. */ @@ -231,7 +272,8 @@ vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info, (if nonnull). */ bool -can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode, +can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count, + machine_mode elt_mode, unsigned int *nvectors_out, tree *vector_type_out, tree *permutes) @@ -243,7 +285,7 @@ can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode, { scalar_int_mode int_mode; poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT; - if (multiple_p (current_vector_size, elt_bytes, &nelts) + if (multiple_p (GET_MODE_SIZE (vinfo->vector_mode), elt_bytes, &nelts) && int_mode_for_size (elt_bits, 0).exists (&int_mode)) { tree int_type = build_nonstandard_integer_type @@ -322,6 +364,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap, { internal_fn ifn = gimple_call_internal_fn (stmt); commutative_op = first_commutative_argument (ifn); + + /* Masked load, only look at mask. */ + if (ifn == IFN_MASK_LOAD) + { + number_of_oprnds = 1; + /* Mask operand index. */ + first_op_idx = 5; + } } } else if (gassign *stmt = dyn_cast (stmt_info->stmt)) @@ -380,6 +430,13 @@ again: if (first) { + /* For the swapping logic below force vect_reduction_def + for the reduction op in a SLP reduction group. 
*/ + if (!STMT_VINFO_DATA_REF (stmt_info) + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && (int)i == STMT_VINFO_REDUC_IDX (stmt_info) + && def_stmt_info) + dt = vect_reduction_def; oprnd_info->first_dt = dt; oprnd_info->first_op_type = TREE_TYPE (oprnd); } @@ -389,20 +446,35 @@ again: the def-stmt/s of the first stmt. Allow different definition types for reduction chains: the first stmt must be a vect_reduction_def (a phi node), and the rest - vect_internal_def. */ + end in the reduction chain. */ tree type = TREE_TYPE (oprnd); if ((oprnd_info->first_dt != dt && !(oprnd_info->first_dt == vect_reduction_def - && dt == vect_internal_def) + && !STMT_VINFO_DATA_REF (stmt_info) + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && def_stmt_info + && !STMT_VINFO_DATA_REF (def_stmt_info) + && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info) + == REDUC_GROUP_FIRST_ELEMENT (stmt_info))) && !((oprnd_info->first_dt == vect_external_def || oprnd_info->first_dt == vect_constant_def) && (dt == vect_external_def || dt == vect_constant_def))) - || !types_compatible_p (oprnd_info->first_op_type, type)) + || !types_compatible_p (oprnd_info->first_op_type, type) + || (!STMT_VINFO_DATA_REF (stmt_info) + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && ((!def_stmt_info + || STMT_VINFO_DATA_REF (def_stmt_info) + || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info) + != REDUC_GROUP_FIRST_ELEMENT (stmt_info))) + != (oprnd_info->first_dt != vect_reduction_def)))) { /* Try swapping operands if we got a mismatch. */ if (i == commutative_op && !swapped) { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "trying swapped operands\n"); swapped = true; goto again; } @@ -415,9 +487,9 @@ again: } if ((dt == vect_constant_def || dt == vect_external_def) - && !current_vector_size.is_constant () + && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () && (TREE_CODE (type) == BOOLEAN_TYPE - || !can_duplicate_and_interleave_p (stmts.length (), + || !can_duplicate_and_interleave_p (vinfo, stmts.length (), TYPE_MODE (type)))) { if (dump_enabled_p ()) @@ -431,14 +503,37 @@ again: /* Check the types of the definitions. */ switch (dt) { - case vect_constant_def: case vect_external_def: + /* Make sure to demote the overall operand to external. */ + oprnd_info->first_dt = vect_external_def; + /* Fallthru. */ + case vect_constant_def: + oprnd_info->def_stmts.quick_push (NULL); + oprnd_info->ops.quick_push (oprnd); break; + case vect_internal_def: case vect_reduction_def: + if (oprnd_info->first_dt == vect_reduction_def + && !STMT_VINFO_DATA_REF (stmt_info) + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && !STMT_VINFO_DATA_REF (def_stmt_info) + && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info) + == REDUC_GROUP_FIRST_ELEMENT (stmt_info))) + { + /* For a SLP reduction chain we want to duplicate the + reduction to each of the chain members. That gets + us a sane SLP graph (still the stmts are not 100% + correct wrt the initial values). */ + gcc_assert (!first); + oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]); + oprnd_info->ops.quick_push (oprnd_info->ops[0]); + break; + } + /* Fallthru. */ case vect_induction_def: - case vect_internal_def: oprnd_info->def_stmts.quick_push (def_stmt_info); + oprnd_info->ops.quick_push (oprnd); break; default: @@ -468,6 +563,8 @@ again: if (first_op_cond) { + /* To get rid of this swapping we have to move the stmt code + to the SLP tree as well (and gather it here per stmt). 
*/ gassign *stmt = as_a (stmt_info->stmt); tree cond = gimple_assign_rhs1 (stmt); enum tree_code code = TREE_CODE (cond); @@ -492,10 +589,8 @@ again: } else { - unsigned int op = commutative_op + first_op_idx; - swap_ssa_operands (stmt_info->stmt, - gimple_op_ptr (stmt_info->stmt, op), - gimple_op_ptr (stmt_info->stmt, op + 1)); + /* Commutative ops need not reflect swapping, ops are in + the SLP tree. */ } if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -620,7 +715,7 @@ vect_two_operations_perm_ok_p (vec stmts, is false then this indicates the comparison could not be carried out or the stmts will never be vectorized by SLP. - Note COND_EXPR is possibly ismorphic to another one after swapping its + Note COND_EXPR is possibly isomorphic to another one after swapping its operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to the first stmt by swapping the two operands of comparison; set SWAP[i] to 2 if stmt I is isormorphic to the first stmt by inverting the code @@ -1030,7 +1125,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, vec stmts, unsigned int group_size, poly_uint64 *max_nunits, bool *matches, unsigned *npermutes, unsigned *tree_size, - unsigned max_tree_size, scalar_stmts_to_slp_tree_map_t *bst_map); static slp_tree @@ -1038,7 +1132,6 @@ vect_build_slp_tree (vec_info *vinfo, vec stmts, unsigned int group_size, poly_uint64 *max_nunits, bool *matches, unsigned *npermutes, unsigned *tree_size, - unsigned max_tree_size, scalar_stmts_to_slp_tree_map_t *bst_map) { if (slp_tree *leader = bst_map->get (stmts)) @@ -1056,8 +1149,7 @@ vect_build_slp_tree (vec_info *vinfo, poly_uint64 this_max_nunits = 1; slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, &this_max_nunits, - matches, npermutes, tree_size, - max_tree_size, bst_map); + matches, npermutes, tree_size, bst_map); if (res) { res->max_nunits = this_max_nunits; @@ -1081,7 +1173,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, vec stmts, unsigned int group_size, poly_uint64 *max_nunits, bool *matches, unsigned *npermutes, unsigned *tree_size, - unsigned max_tree_size, scalar_stmts_to_slp_tree_map_t *bst_map) { unsigned nops, i, this_tree_size = 0; @@ -1109,7 +1200,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, if (gphi *stmt = dyn_cast (stmt_info->stmt)) { tree scalar_type = TREE_TYPE (PHI_RESULT (stmt)); - tree vectype = get_vectype_for_scalar_type (scalar_type); + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits)) return NULL; @@ -1129,18 +1220,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, /* Else def types have to match. */ stmt_vec_info other_info; FOR_EACH_VEC_ELT (stmts, i, other_info) - { - /* But for reduction chains only check on the first stmt. */ - if (!STMT_VINFO_DATA_REF (other_info) - && REDUC_GROUP_FIRST_ELEMENT (other_info) - && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info) - continue; - if (STMT_VINFO_DEF_TYPE (other_info) != def_type) - return NULL; - } + if (STMT_VINFO_DEF_TYPE (other_info) != def_type) + return NULL; } else return NULL; + (*tree_size)++; node = vect_create_new_slp_node (stmts); return node; } @@ -1152,13 +1237,23 @@ vect_build_slp_tree_2 (vec_info *vinfo, &this_max_nunits, matches, &two_operators)) return NULL; - /* If the SLP node is a load, terminate the recursion. */ + /* If the SLP node is a load, terminate the recursion unless masked. 
*/ if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) { - *max_nunits = this_max_nunits; - node = vect_create_new_slp_node (stmts); - return node; + if (gcall *stmt = dyn_cast (stmt_info->stmt)) + { + /* Masked load. */ + gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)); + nops = 1; + } + else + { + *max_nunits = this_max_nunits; + (*tree_size)++; + node = vect_create_new_slp_node (stmts); + return node; + } } /* Get at the operands, verifying they are compatible. */ @@ -1184,9 +1279,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, stmt_info = stmts[0]; - if (tree_size) - max_tree_size -= *tree_size; - /* Create SLP_TREE nodes for the definition node/s. */ FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) { @@ -1194,32 +1286,34 @@ vect_build_slp_tree_2 (vec_info *vinfo, unsigned old_tree_size = this_tree_size; unsigned int j; + if (oprnd_info->first_dt == vect_uninitialized_def) + { + /* COND_EXPR have one too many eventually if the condition + is a SSA name. */ + gcc_assert (i == 3 && nops == 4); + continue; + } + if (oprnd_info->first_dt != vect_internal_def && oprnd_info->first_dt != vect_reduction_def && oprnd_info->first_dt != vect_induction_def) - continue; - - if (++this_tree_size > max_tree_size) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "Build SLP failed: SLP tree too large\n"); - FOR_EACH_VEC_ELT (children, j, child) - vect_free_slp_tree (child, false); - vect_free_oprnd_info (oprnds_info); - return NULL; + slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); + SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; + oprnd_info->ops = vNULL; + children.safe_push (invnode); + continue; } if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, group_size, &this_max_nunits, matches, npermutes, - &this_tree_size, - max_tree_size, bst_map)) != NULL) + &this_tree_size, bst_map)) != NULL) { /* If we have all children of child built up from scalars then just throw that away and build it up this node from scalars. */ - if (!SLP_TREE_CHILDREN (child).is_empty () + if (is_a (vinfo) + && !SLP_TREE_CHILDREN (child).is_empty () /* ??? Rejecting patterns this way doesn't work. We'd have to do extra work to cancel the pattern so the uses see the scalar version. 
*/ @@ -1244,6 +1338,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, "scalars instead\n"); oprnd_info->def_stmts = vNULL; SLP_TREE_DEF_TYPE (child) = vect_external_def; + SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops; + oprnd_info->ops = vNULL; + ++this_tree_size; children.safe_push (child); continue; } @@ -1273,9 +1370,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Building vector operands from scalars\n"); + this_tree_size++; child = vect_create_new_slp_node (oprnd_info->def_stmts); SLP_TREE_DEF_TYPE (child) = vect_external_def; + SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops; children.safe_push (child); + oprnd_info->ops = vNULL; oprnd_info->def_stmts = vNULL; continue; } @@ -1355,6 +1455,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, { std::swap (oprnds_info[0]->def_stmts[j], oprnds_info[1]->def_stmts[j]); + std::swap (oprnds_info[0]->ops[j], + oprnds_info[1]->ops[j]); if (dump_enabled_p ()) dump_printf (MSG_NOTE, "%d ", j); } @@ -1365,37 +1467,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, group_size, &this_max_nunits, tem, npermutes, - &this_tree_size, - max_tree_size, bst_map)) != NULL) + &this_tree_size, bst_map)) != NULL) { - /* ... so if successful we can apply the operand swapping - to the GIMPLE IL. This is necessary because for example - vect_get_slp_defs uses operand indexes and thus expects - canonical operand order. This is also necessary even - if we end up building the operand from scalars as - we'll continue to process swapped operand two. */ - for (j = 0; j < group_size; ++j) - gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false); - for (j = 0; j < group_size; ++j) - if (matches[j] == !swap_not_matching) - { - gassign *stmt = as_a (stmts[j]->stmt); - /* Avoid swapping operands twice. */ - if (gimple_plf (stmt, GF_PLF_1)) - continue; - swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt), - gimple_assign_rhs2_ptr (stmt)); - gimple_set_plf (stmt, GF_PLF_1, true); - } - /* Verify we swap all duplicates or none. */ - if (flag_checking) - for (j = 0; j < group_size; ++j) - gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1) - == (matches[j] == !swap_not_matching)); - /* If we have all children of child built up from scalars then just throw that away and build it up this node from scalars. */ - if (!SLP_TREE_CHILDREN (child).is_empty () + if (is_a (vinfo) + && !SLP_TREE_CHILDREN (child).is_empty () /* ??? Rejecting patterns this way doesn't work. We'd have to do extra work to cancel the pattern so the uses see the scalar version. 
*/ @@ -1421,6 +1498,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, "scalars instead\n"); oprnd_info->def_stmts = vNULL; SLP_TREE_DEF_TYPE (child) = vect_external_def; + SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops; + oprnd_info->ops = vNULL; + ++this_tree_size; children.safe_push (child); continue; } @@ -1444,8 +1524,7 @@ fail: vect_free_oprnd_info (oprnds_info); - if (tree_size) - *tree_size += this_tree_size; + *tree_size += this_tree_size + 1; *max_nunits = this_max_nunits; node = vect_create_new_slp_node (stmts); @@ -1460,9 +1539,10 @@ static void vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, slp_tree node, hash_set &visited) { - int i; + unsigned i; stmt_vec_info stmt_info; slp_tree child; + tree op; if (visited.add (node)) return; @@ -1470,11 +1550,23 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, dump_metadata_t metadata (dump_kind, loc.get_impl_location ()); dump_user_location_t user_loc = loc.get_user_location (); dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n", - SLP_TREE_DEF_TYPE (node) != vect_internal_def - ? " (external)" : "", node, + SLP_TREE_DEF_TYPE (node) == vect_external_def + ? " (external)" + : (SLP_TREE_DEF_TYPE (node) == vect_constant_def + ? " (constant)" + : ""), node, estimated_poly_value (node->max_nunits)); - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) - dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt); + if (SLP_TREE_SCALAR_STMTS (node).exists ()) + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) + dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt); + else + { + dump_printf_loc (metadata, user_loc, "\t{ "); + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + dump_printf (metadata, "%T%s ", op, + i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? 
"," : ""); + dump_printf (metadata, "}\n"); + } if (SLP_TREE_CHILDREN (node).is_empty ()) return; dump_printf_loc (metadata, user_loc, "\tchildren"); @@ -1563,8 +1655,6 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, vec permutation, hash_set &visited) { - stmt_vec_info stmt_info; - vec tmp_stmts; unsigned int i; slp_tree child; @@ -1574,15 +1664,30 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) vect_slp_rearrange_stmts (child, group_size, permutation, visited); - gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); - tmp_stmts.create (group_size); - tmp_stmts.quick_grow_cleared (group_size); - - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) - tmp_stmts[permutation[i]] = stmt_info; - - SLP_TREE_SCALAR_STMTS (node).release (); - SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; + if (SLP_TREE_SCALAR_STMTS (node).exists ()) + { + gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); + vec tmp_stmts; + tmp_stmts.create (group_size); + tmp_stmts.quick_grow (group_size); + stmt_vec_info stmt_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) + tmp_stmts[permutation[i]] = stmt_info; + SLP_TREE_SCALAR_STMTS (node).release (); + SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; + } + if (SLP_TREE_SCALAR_OPS (node).exists ()) + { + gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ()); + vec tmp_ops; + tmp_ops.create (group_size); + tmp_ops.quick_grow (group_size); + tree op; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + tmp_ops[permutation[i]] = op; + SLP_TREE_SCALAR_OPS (node).release (); + SLP_TREE_SCALAR_OPS (node) = tmp_ops; + } } @@ -1668,9 +1773,10 @@ vect_gather_slp_loads (slp_instance inst, slp_tree node, if (SLP_TREE_CHILDREN (node).length () == 0) { + if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) + return; stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; - if (SLP_TREE_DEF_TYPE (node) == vect_internal_def - && STMT_VINFO_GROUPED_ACCESS (stmt_info) + if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) SLP_INSTANCE_LOADS (inst).safe_push (node); } @@ -1913,7 +2019,7 @@ vect_analyze_slp_instance (vec_info *vinfo, if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) { scalar_type = TREE_TYPE (DR_REF (dr)); - vectype = get_vectype_for_scalar_type (scalar_type); + vectype = get_vectype_for_scalar_type (vinfo, scalar_type); group_size = DR_GROUP_SIZE (stmt_info); } else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) @@ -1964,7 +2070,8 @@ vect_analyze_slp_instance (vec_info *vinfo, /* Mark the first element of the reduction chain as reduction to properly transform the node. In the reduction analysis phase only the last element of the chain is marked as reduction. */ - STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def; + STMT_VINFO_DEF_TYPE (stmt_info) + = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); } @@ -1982,9 +2089,10 @@ vect_analyze_slp_instance (vec_info *vinfo, scalar_stmts_to_slp_tree_map_t *bst_map = new scalar_stmts_to_slp_tree_map_t (); poly_uint64 max_nunits = nunits; + unsigned tree_size = 0; node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, &max_nunits, matches, &npermutes, - NULL, max_tree_size, bst_map); + &tree_size, bst_map); /* The map keeps a reference on SLP nodes built, release that. 
*/ for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); it != bst_map->end (); ++it) @@ -1993,6 +2101,34 @@ vect_analyze_slp_instance (vec_info *vinfo, delete bst_map; if (node != NULL) { + /* If this is a reduction chain with a conversion in front + amend the SLP tree with a node for that. */ + if (!dr + && REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) + { + /* Get at the conversion stmt - we know it's the single use + of the last stmt of the reduction chain. */ + gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt; + use_operand_p use_p; + gimple *use_stmt; + bool r = single_imm_use (gimple_assign_lhs (tem), &use_p, &use_stmt); + gcc_assert (r); + next_info = vinfo->lookup_stmt (use_stmt); + next_info = vect_stmt_to_vectorize (next_info); + scalar_stmts = vNULL; + scalar_stmts.create (group_size); + for (unsigned i = 0; i < group_size; ++i) + scalar_stmts.quick_push (next_info); + slp_tree conv = vect_create_new_slp_node (scalar_stmts); + SLP_TREE_CHILDREN (conv).quick_push (node); + node = conv; + /* We also have to fake this conversion stmt as SLP reduction group + so we don't have to mess with too much code elsewhere. */ + REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; + REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; + } + /* Calculate the unrolling factor based on the smallest type. */ poly_uint64 unrolling_factor = calculate_unrolling_factor (max_nunits, group_size); @@ -2025,6 +2161,10 @@ vect_analyze_slp_instance (vec_info *vinfo, SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; SLP_INSTANCE_LOADS (new_instance) = vNULL; vect_gather_slp_loads (new_instance, node); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "SLP size %u vs. limit %u.\n", + tree_size, max_tree_size); /* Compute the load permutation. */ slp_tree load_node; @@ -2231,8 +2371,11 @@ vect_make_slp_decision (loop_vec_info loop_vinfo) FOR_EACH_VEC_ELT (slp_instances, i, instance) { /* FORNOW: SLP if you can. */ - /* All unroll factors have the form current_vector_size * X for some - rational X, so they must have a common multiple. */ + /* All unroll factors have the form: + + GET_MODE_SIZE (vinfo->vector_mode) * X + + for some rational X, so they must have a common multiple. */ unrolling_factor = force_common_multiple (unrolling_factor, SLP_INSTANCE_UNROLLING_FACTOR (instance)); @@ -2327,7 +2470,8 @@ vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype, if (!only_edge) FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - if (SLP_TREE_DEF_TYPE (child) != vect_external_def) + if (SLP_TREE_DEF_TYPE (child) != vect_external_def + && SLP_TREE_DEF_TYPE (child) != vect_constant_def) vect_detect_hybrid_slp_stmts (child, i, stype, visited); } @@ -2514,8 +2658,15 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, VF divided by the number of elements in a vector. 
*/ if (!STMT_VINFO_GROUPED_ACCESS (stmt_info) && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) - SLP_TREE_NUMBER_OF_VEC_STMTS (node) - = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]); + { + for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i) + if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def) + { + SLP_TREE_NUMBER_OF_VEC_STMTS (node) + = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]); + break; + } + } else { poly_uint64 vf; @@ -2533,6 +2684,39 @@ vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node, return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec); } +/* Try to build NODE from scalars, returning true on success. + NODE_INSTANCE is the SLP instance that contains NODE. */ + +static bool +vect_slp_convert_to_external (vec_info *vinfo, slp_tree node, + slp_instance node_instance) +{ + stmt_vec_info stmt_info; + unsigned int i; + + if (!is_a (vinfo) + || node == SLP_INSTANCE_TREE (node_instance) + || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))) + return false; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Building vector operands from scalars instead\n"); + + /* Don't remove and free the child nodes here, since they could be + referenced by other structures. The analysis and scheduling phases + (need to) ignore child nodes of anything that isn't vect_internal_def. */ + unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length (); + SLP_TREE_DEF_TYPE (node) = vect_external_def; + SLP_TREE_SCALAR_OPS (node).safe_grow (group_size); + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) + { + tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt); + SLP_TREE_SCALAR_OPS (node)[i] = lhs; + } + return true; +} + /* Analyze statements contained in SLP tree NODE after recursively analyzing the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE. @@ -2559,6 +2743,13 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, { SLP_TREE_NUMBER_OF_VEC_STMTS (node) = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader); + /* Cope with cases in which we made a late decision to build the + node from scalars. */ + if (SLP_TREE_DEF_TYPE (*leader) == vect_external_def + && vect_slp_convert_to_external (vinfo, node, node_instance)) + ; + else + gcc_assert (SLP_TREE_DEF_TYPE (node) == SLP_TREE_DEF_TYPE (*leader)); return true; } @@ -2579,25 +2770,31 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, auto_vec dt; dt.safe_grow (SLP_TREE_CHILDREN (node).length ()); FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]); + if (SLP_TREE_SCALAR_STMTS (child).length () != 0) + dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]); /* Push SLP node def-type to stmt operands. */ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) + if (SLP_TREE_DEF_TYPE (child) != vect_internal_def + && SLP_TREE_SCALAR_STMTS (child).length () != 0) STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = SLP_TREE_DEF_TYPE (child); /* Check everything worked out. 
*/ bool res = true; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) - { - if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) - != SLP_TREE_DEF_TYPE (child)) - res = false; - } - else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j]) - res = false; + if (SLP_TREE_SCALAR_STMTS (child).length () != 0) + { + if (SLP_TREE_DEF_TYPE (child) != vect_internal_def) + { + if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) + != SLP_TREE_DEF_TYPE (child)) + res = false; + } + else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) + != dt[j]) + res = false; + } if (!res && dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: same operand with different " @@ -2609,7 +2806,13 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, /* Restore def-types. */ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) - STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j]; + if (SLP_TREE_SCALAR_STMTS (child).length () != 0) + STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j]; + + /* If this node can't be vectorized, try pruning the tree here rather + than felling the whole thing. */ + if (!res && vect_slp_convert_to_external (vinfo, node, node_instance)) + res = true; return res; } @@ -2818,19 +3021,17 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) return true; } -/* Check if the basic block can be vectorized. Returns a bb_vec_info - if so and sets fatal to true if failure is independent of - current_vector_size. */ +/* Check if the region described by BB_VINFO can be vectorized, returning + true if so. When returning false, set FATAL to true if the same failure + would prevent vectorization at other vector sizes, false if it is still + worth trying other sizes. N_STMTS is the number of statements in the + region. */ -static bb_vec_info -vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, - gimple_stmt_iterator region_end, - vec datarefs, int n_stmts, - bool &fatal, vec_info_shared *shared) +static bool +vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) { DUMP_VECT_SCOPE ("vect_slp_analyze_bb"); - bb_vec_info bb_vinfo; slp_instance instance; int i; poly_uint64 min_vf = 2; @@ -2838,34 +3039,15 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, /* The first group of checks is independent of the vector size. */ fatal = true; - if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: too many instructions in " - "basic block.\n"); - free_data_refs (datarefs); - return NULL; - } - - bb_vinfo = new _bb_vec_info (region_begin, region_end, shared); - if (!bb_vinfo) - return NULL; - - BB_VINFO_DATAREFS (bb_vinfo) = datarefs; - bb_vinfo->shared->save_datarefs (); - /* Analyze the data references. 
*/ - if (!vect_analyze_data_refs (bb_vinfo, &min_vf)) + if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: unhandled data-ref in basic " "block.\n"); - - delete bb_vinfo; - return NULL; + return false; } if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2) @@ -2874,9 +3056,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: not enough data-refs in " "basic block.\n"); - - delete bb_vinfo; - return NULL; + return false; } if (!vect_analyze_data_ref_accesses (bb_vinfo)) @@ -2885,9 +3065,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: unhandled data access in " "basic block.\n"); - - delete bb_vinfo; - return NULL; + return false; } /* If there are no grouped stores in the region there is no need @@ -2899,9 +3077,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: no grouped stores in " "basic block.\n"); - - delete bb_vinfo; - return NULL; + return false; } /* While the rest of the analysis below depends on it in some way. */ @@ -2921,9 +3097,7 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, "not vectorized: failed to find SLP opportunities " "in basic block.\n"); } - - delete bb_vinfo; - return NULL; + return false; } vect_record_base_alignments (bb_vinfo); @@ -2954,19 +3128,14 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, i++; } if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) - { - delete bb_vinfo; - return NULL; - } + return false; if (!vect_slp_analyze_operations (bb_vinfo)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: bad operation in basic block.\n"); - - delete bb_vinfo; - return NULL; + return false; } /* Cost model: check if the vectorization is worthwhile. */ @@ -2977,80 +3146,61 @@ vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "not vectorized: vectorization is not " "profitable.\n"); - - delete bb_vinfo; - return NULL; + return false; } if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Basic block will be vectorized using SLP\n"); - - return bb_vinfo; + return true; } +/* Subroutine of vect_slp_bb. Try to vectorize the statements between + REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true + on success. The region has N_STMTS statements and has the datarefs + given by DATAREFS. */ -/* Main entry for the BB vectorizer. Analyze and transform BB, returns - true if anything in the basic-block was vectorized. */ - -bool -vect_slp_bb (basic_block bb) +static bool +vect_slp_bb_region (gimple_stmt_iterator region_begin, + gimple_stmt_iterator region_end, + vec datarefs, + unsigned int n_stmts) { bb_vec_info bb_vinfo; - gimple_stmt_iterator gsi; - bool any_vectorized = false; - auto_vector_sizes vector_sizes; + auto_vector_modes vector_modes; /* Autodetect first vector size we try. 
*/ - current_vector_size = 0; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); - unsigned int next_size = 0; + machine_mode next_vector_mode = VOIDmode; + targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); + unsigned int mode_i = 0; - gsi = gsi_start_bb (bb); + vec_info_shared shared; - poly_uint64 autodetected_vector_size = 0; + machine_mode autodetected_vector_mode = VOIDmode; while (1) { - if (gsi_end_p (gsi)) - break; - - gimple_stmt_iterator region_begin = gsi; - vec datarefs = vNULL; - int insns = 0; - - for (; !gsi_end_p (gsi); gsi_next (&gsi)) - { - gimple *stmt = gsi_stmt (gsi); - if (is_gimple_debug (stmt)) - continue; - insns++; - - if (gimple_location (stmt) != UNKNOWN_LOCATION) - vect_location = stmt; - - if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs)) - break; - } - - /* Skip leading unhandled stmts. */ - if (gsi_stmt (region_begin) == gsi_stmt (gsi)) - { - gsi_next (&gsi); - continue; - } - - gimple_stmt_iterator region_end = gsi; - bool vectorized = false; bool fatal = false; - vec_info_shared shared; - bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end, - datarefs, insns, fatal, &shared); - if (bb_vinfo + bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared); + + bool first_time_p = shared.datarefs.is_empty (); + BB_VINFO_DATAREFS (bb_vinfo) = datarefs; + if (first_time_p) + bb_vinfo->shared->save_datarefs (); + else + bb_vinfo->shared->check_datarefs (); + bb_vinfo->vector_mode = next_vector_mode; + + if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal) && dbg_cnt (vect_slp)) { if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); + { + dump_printf_loc (MSG_NOTE, vect_location, + "***** Analysis succeeded with vector mode" + " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode)); + dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n"); + } bb_vinfo->shared->check_datarefs (); vect_schedule_slp (bb_vinfo); @@ -3058,7 +3208,7 @@ vect_slp_bb (basic_block bb) unsigned HOST_WIDE_INT bytes; if (dump_enabled_p ()) { - if (current_vector_size.is_constant (&bytes)) + if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes)) dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "basic block part vectorized using %wu byte " "vectors\n", bytes); @@ -3070,50 +3220,120 @@ vect_slp_bb (basic_block bb) vectorized = true; } - delete bb_vinfo; + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Analysis failed with vector mode %s\n", + GET_MODE_NAME (bb_vinfo->vector_mode)); + } - any_vectorized |= vectorized; + if (mode_i == 0) + autodetected_vector_mode = bb_vinfo->vector_mode; - if (next_size == 0) - autodetected_vector_size = current_vector_size; + if (!fatal) + while (mode_i < vector_modes.length () + && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i])) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** The result for vector mode %s would" + " be the same\n", + GET_MODE_NAME (vector_modes[mode_i])); + mode_i += 1; + } - if (next_size < vector_sizes.length () - && known_eq (vector_sizes[next_size], autodetected_vector_size)) - next_size += 1; + delete bb_vinfo; + + if (mode_i < vector_modes.length () + && VECTOR_MODE_P (autodetected_vector_mode) + && (related_vector_mode (vector_modes[mode_i], + GET_MODE_INNER (autodetected_vector_mode)) + == autodetected_vector_mode) + && (related_vector_mode (autodetected_vector_mode, + GET_MODE_INNER (vector_modes[mode_i])) + == vector_modes[mode_i])) + { + if 
(dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Skipping vector mode %s, which would" + " repeat the analysis for %s\n", + GET_MODE_NAME (vector_modes[mode_i]), + GET_MODE_NAME (autodetected_vector_mode)); + mode_i += 1; + } if (vectorized - || next_size == vector_sizes.length () - || known_eq (current_vector_size, 0U) + || mode_i == vector_modes.length () + || autodetected_vector_mode == VOIDmode /* If vect_slp_analyze_bb_1 signaled that analysis for all vector sizes will fail do not bother iterating. */ || fatal) + return vectorized; + + /* Try the next biggest vector size. */ + next_vector_mode = vector_modes[mode_i++]; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** Re-trying analysis with vector mode %s\n", + GET_MODE_NAME (next_vector_mode)); + } +} + +/* Main entry for the BB vectorizer. Analyze and transform BB, returns + true if anything in the basic-block was vectorized. */ + +bool +vect_slp_bb (basic_block bb) +{ + gimple_stmt_iterator gsi; + bool any_vectorized = false; + + gsi = gsi_start_bb (bb); + while (!gsi_end_p (gsi)) + { + gimple_stmt_iterator region_begin = gsi; + vec datarefs = vNULL; + int insns = 0; + + for (; !gsi_end_p (gsi); gsi_next (&gsi)) { - if (gsi_end_p (region_end)) + gimple *stmt = gsi_stmt (gsi); + if (is_gimple_debug (stmt)) + continue; + insns++; + + if (gimple_location (stmt) != UNKNOWN_LOCATION) + vect_location = stmt; + + if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs)) break; + } - /* Skip the unhandled stmt. */ + /* Skip leading unhandled stmts. */ + if (gsi_stmt (region_begin) == gsi_stmt (gsi)) + { gsi_next (&gsi); - - /* And reset vector sizes. */ - current_vector_size = 0; - next_size = 0; + continue; } - else + + gimple_stmt_iterator region_end = gsi; + + if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)) { - /* Try the next biggest vector size. */ - current_vector_size = vector_sizes[next_size++]; if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "***** Re-trying analysis with " - "vector size "); - dump_dec (MSG_NOTE, current_vector_size); - dump_printf (MSG_NOTE, "\n"); - } - - /* Start over. */ - gsi = region_begin; + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: too many instructions in " + "basic block.\n"); } + else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns)) + any_vectorized = true; + + if (gsi_end_p (region_end)) + break; + + /* Skip the unhandled stmt. */ + gsi_next (&gsi); } return any_vectorized; @@ -3184,8 +3404,9 @@ vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo) to cut down on the number of interleaves. 
*/ void -duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec elts, - unsigned int nresults, vec &results) +duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type, + vec elts, unsigned int nresults, + vec &results) { unsigned int nelts = elts.length (); tree element_type = TREE_TYPE (vector_type); @@ -3194,7 +3415,7 @@ duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec elts, unsigned int nvectors = 1; tree new_vector_type; tree permutes[2]; - if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type), + if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type), &nvectors, &new_vector_type, permutes)) gcc_unreachable (); @@ -3276,52 +3497,45 @@ duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec elts, /* For constant and loop invariant defs of SLP_NODE this function returns (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. - OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of - scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. - REDUC_INDEX is the index of the reduction operand in the statements, unless - it is -1. */ + OP_NODE determines the node for the operand containing the scalar + operands. */ static void -vect_get_constant_vectors (tree op, slp_tree slp_node, - vec *vec_oprnds, - unsigned int op_num, unsigned int number_of_vectors) +vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node, + vec *vec_oprnds) { - vec stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; - gimple *stmt = stmt_vinfo->stmt; + stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0]; + vec_info *vinfo = stmt_vinfo->vinfo; unsigned HOST_WIDE_INT nunits; tree vec_cst; unsigned j, number_of_places_left_in_vector; tree vector_type; tree vop; - int group_size = stmts.length (); + int group_size = op_node->ops.length (); unsigned int vec_num, i; unsigned number_of_copies = 1; - vec voprnds; - voprnds.create (number_of_vectors); - bool constant_p, is_store; + bool constant_p; tree neutral_op = NULL; - enum tree_code code = gimple_expr_code (stmt); gimple_seq ctor_seq = NULL; auto_vec permute_results; + /* ??? SLP analysis should compute the vector type for the + constant / invariant and store it in the SLP node. */ + tree op = op_node->ops[0]; /* Check if vector type is a boolean vector. */ + tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo); if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) && vect_mask_constant_operand_p (stmt_vinfo)) - vector_type - = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo)); - else - vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); - - if (STMT_VINFO_DATA_REF (stmt_vinfo)) - { - is_store = true; - op = gimple_assign_rhs1 (stmt); - } + vector_type = truth_type_for (stmt_vectype); else - is_store = false; + vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op)); - gcc_assert (op); + unsigned int number_of_vectors + = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) + * TYPE_VECTOR_SUBPARTS (stmt_vectype), + vector_type); + vec_oprnds->create (number_of_vectors); + auto_vec voprnds (number_of_vectors); /* NUMBER_OF_COPIES is the number of times we need to use the same values in created vectors. It is greater than 1 if unrolling is performed. 
@@ -3353,56 +3567,8 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, bool place_after_defs = false; for (j = 0; j < number_of_copies; j++) { - for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--) + for (i = group_size - 1; op_node->ops.iterate (i, &op); i--) { - stmt = stmt_vinfo->stmt; - if (is_store) - op = gimple_assign_rhs1 (stmt); - else - { - switch (code) - { - case COND_EXPR: - { - tree cond = gimple_assign_rhs1 (stmt); - if (TREE_CODE (cond) == SSA_NAME) - op = gimple_op (stmt, op_num + 1); - else if (op_num == 0 || op_num == 1) - op = TREE_OPERAND (cond, op_num); - else - { - if (op_num == 2) - op = gimple_assign_rhs2 (stmt); - else - op = gimple_assign_rhs3 (stmt); - } - } - break; - - case CALL_EXPR: - op = gimple_call_arg (stmt, op_num); - break; - - case LSHIFT_EXPR: - case RSHIFT_EXPR: - case LROTATE_EXPR: - case RROTATE_EXPR: - op = gimple_op (stmt, op_num + 1); - /* Unlike the other binary operators, shifts/rotates have - the shift count being int, instead of the same type as - the lhs, so make sure the scalar is the right type if - we are dealing with vectors of - long long/long/short/char. */ - if (op_num == 1 && TREE_CODE (op) == INTEGER_CST) - op = fold_convert (TREE_TYPE (vector_type), op); - break; - - default: - op = gimple_op (stmt, op_num + 1); - break; - } - } - /* Create 'vect_ = {op0,op1,...,opn}'. */ number_of_places_left_in_vector--; tree orig_op = op; @@ -3472,9 +3638,9 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, vec_cst = gimple_build_vector (&ctor_seq, &elts); else { - if (vec_oprnds->is_empty ()) - duplicate_and_interleave (&ctor_seq, vector_type, elts, - number_of_vectors, + if (permute_results.is_empty ()) + duplicate_and_interleave (vinfo, &ctor_seq, vector_type, + elts, number_of_vectors, permute_results); vec_cst = permute_results[number_of_vectors - j - 1]; } @@ -3516,8 +3682,6 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, vec_oprnds->quick_push (vop); } - voprnds.release (); - /* In case that VF is greater than the unrolling factor needed for the SLP group of stmts, NUMBER_OF_VECTORS to be created is greater than NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have @@ -3548,25 +3712,17 @@ vect_get_constant_vectors (tree op, slp_tree slp_node, static void vect_get_slp_vect_defs (slp_tree slp_node, vec *vec_oprnds) { - tree vec_oprnd; stmt_vec_info vec_def_stmt_info; unsigned int i; gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ()); FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info) - { - gcc_assert (vec_def_stmt_info); - if (gphi *vec_def_phi = dyn_cast (vec_def_stmt_info->stmt)) - vec_oprnd = gimple_phi_result (vec_def_phi); - else - vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt); - vec_oprnds->quick_push (vec_oprnd); - } + vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt)); } -/* Get vectorized definitions for SLP_NODE. +/* Get N vectorized definitions for SLP_NODE. If the scalar definitions are loop invariants or constants, collect them and call vect_get_constant_vectors() to create vector stmts. Otherwise, the def-stmts must be already vectorized and the vectorized stmts @@ -3574,91 +3730,26 @@ vect_get_slp_vect_defs (slp_tree slp_node, vec *vec_oprnds) vect_get_slp_vect_defs () to retrieve them. 
*/ void -vect_get_slp_defs (vec ops, slp_tree slp_node, - vec > *vec_oprnds) +vect_get_slp_defs (slp_tree slp_node, vec > *vec_oprnds, unsigned n) { - int number_of_vects = 0, i; - unsigned int child_index = 0; - HOST_WIDE_INT lhs_size_unit, rhs_size_unit; - slp_tree child = NULL; - vec vec_defs; - tree oprnd; - bool vectorized_defs; + if (n == -1U) + n = SLP_TREE_CHILDREN (slp_node).length (); - stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; - FOR_EACH_VEC_ELT (ops, i, oprnd) + for (unsigned i = 0; i < n; ++i) { - /* For each operand we check if it has vectorized definitions in a child - node or we need to create them (for invariants and constants). We - check if the LHS of the first stmt of the next child matches OPRND. - If it does, we found the correct child. Otherwise, we call - vect_get_constant_vectors (), and not advance CHILD_INDEX in order - to check this child node for the next operand. */ - vectorized_defs = false; - if (SLP_TREE_CHILDREN (slp_node).length () > child_index) - { - child = SLP_TREE_CHILDREN (slp_node)[child_index]; - - /* We have to check both pattern and original def, if available. */ - if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) - { - stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0]; - stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info); - tree first_def_op; - - if (gphi *first_def = dyn_cast (first_def_info->stmt)) - first_def_op = gimple_phi_result (first_def); - else - first_def_op = gimple_get_lhs (first_def_info->stmt); - if (operand_equal_p (oprnd, first_def_op, 0) - || (related - && operand_equal_p (oprnd, - gimple_get_lhs (related->stmt), 0))) - { - /* The number of vector defs is determined by the number of - vector statements in the node from which we get those - statements. */ - number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child); - vectorized_defs = true; - child_index++; - } - } - else - child_index++; - } - - if (!vectorized_defs) - { - if (i == 0) - { - number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - /* Number of vector stmts was calculated according to LHS in - vect_schedule_slp_instance (), fix it by replacing LHS with - RHS, if necessary. See vect_get_smallest_scalar_type () for - details. */ - vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit, - &rhs_size_unit); - if (rhs_size_unit != lhs_size_unit) - { - number_of_vects *= rhs_size_unit; - number_of_vects /= lhs_size_unit; - } - } - } + slp_tree child = SLP_TREE_CHILDREN (slp_node)[i]; - /* Allocate memory for vectorized defs. */ - vec_defs = vNULL; - vec_defs.create (number_of_vects); + vec vec_defs = vNULL; - /* For reduction defs we call vect_get_constant_vectors (), since we are - looking for initial loop invariant values. */ - if (vectorized_defs) - /* The defs are already vectorized. */ - vect_get_slp_vect_defs (child, &vec_defs); + /* For each operand we check if it has vectorized definitions in a child + node or we need to create them (for invariants and constants). */ + if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + { + vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child)); + vect_get_slp_vect_defs (child, &vec_defs); + } else - /* Build vectors from scalar defs. 
*/ - vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i, - number_of_vects); + vect_get_constant_vectors (child, slp_node, &vec_defs); vec_oprnds->quick_push (vec_defs); } @@ -3939,17 +4030,6 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); si = gsi_for_stmt (last_stmt_info->stmt); - /* Mark the first element of the reduction chain as reduction to properly - transform the node. In the analysis phase only the last element of the - chain is marked as reduction. */ - if (!STMT_VINFO_GROUPED_ACCESS (stmt_info) - && REDUC_GROUP_FIRST_ELEMENT (stmt_info) - && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) - { - STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def; - STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; - } - /* Handle two-operation SLP nodes by vectorizing the group with both operations and then performing a merge. */ if (SLP_TREE_TWO_OPERATORS (node)) diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 74abfbfe56e..5d6da3d9708 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -329,13 +329,13 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, basic_block bb = gimple_bb (USE_STMT (use_p)); if (!flow_bb_inside_loop_p (loop, bb)) { + if (is_gimple_debug (USE_STMT (use_p))) + continue; + if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vec_stmt_relevant_p: used out of loop.\n"); - if (is_gimple_debug (USE_STMT (use_p))) - continue; - /* We expect all such uses to be in the loop exit phis (because of loop closed form) */ gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI); @@ -456,7 +456,6 @@ process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo, bool force) { stmt_vec_info dstmt_vinfo; - basic_block bb, def_bb; enum vect_def_type dt; /* case 1: we are only interested in uses that need to be vectorized. Uses @@ -472,28 +471,8 @@ process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo, if (!dstmt_vinfo) return opt_result::success (); - def_bb = gimple_bb (dstmt_vinfo->stmt); - - /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO). - DSTMT_VINFO must have already been processed, because this should be the - only way that STMT, which is a reduction-phi, was put in the worklist, - as there should be no other uses for DSTMT_VINFO in the loop. So we just - check that everything is as expected, and we are done. */ - bb = gimple_bb (stmt_vinfo->stmt); - if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI - && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def - && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI - && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def - && bb->loop_father == def_bb->loop_father) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "reduc-stmt defining reduc-phi in the same nest.\n"); - gcc_assert (STMT_VINFO_RELEVANT (dstmt_vinfo) < vect_used_by_reduction); - gcc_assert (STMT_VINFO_LIVE_P (dstmt_vinfo) - || STMT_VINFO_RELEVANT (dstmt_vinfo) > vect_unused_in_scope); - return opt_result::success (); - } + basic_block def_bb = gimple_bb (dstmt_vinfo->stmt); + basic_block bb = gimple_bb (stmt_vinfo->stmt); /* case 3a: outer-loop stmt defining an inner-loop stmt: outer-loop-header-bb: @@ -607,7 +586,7 @@ process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo, This pass detects such stmts. 
*/ opt_result -vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo) +vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); @@ -777,7 +756,11 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo) = process_use (stmt_vinfo, gs_info.offset, loop_vinfo, relevant, &worklist, true); if (!res) - return res; + { + if (fatal) + *fatal = false; + return res; + } } } /* while worklist */ @@ -791,6 +774,7 @@ vect_prologue_cost_for_slp_op (slp_tree node, stmt_vec_info stmt_info, unsigned opno, enum vect_def_type dt, stmt_vector_for_cost *cost_vec) { + vec_info *vinfo = stmt_info->vinfo; gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0]->stmt; tree op = gimple_op (stmt, opno); unsigned prologue_cost = 0; @@ -798,7 +782,7 @@ vect_prologue_cost_for_slp_op (slp_tree node, stmt_vec_info stmt_info, /* Without looking at the actual initializer a vector of constants can be implemented as load from the constant pool. When all elements are the same we can use a splat. */ - tree vectype = get_vectype_for_scalar_type (TREE_TYPE (op)); + tree vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op)); unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length (); unsigned num_vects_to_check; unsigned HOST_WIDE_INT const_nunits; @@ -1603,9 +1587,9 @@ vect_get_vec_def_for_operand (tree op, stmt_vec_info stmt_vinfo, tree vectype) vector_type = vectype; else if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op)) && VECTOR_BOOLEAN_TYPE_P (stmt_vectype)) - vector_type = build_same_sized_truth_vector_type (stmt_vectype); + vector_type = truth_type_for (stmt_vectype); else - vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); + vector_type = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op)); gcc_assert (vector_type); return vect_init_vector (stmt_vinfo, op, vector_type, NULL); @@ -1720,16 +1704,8 @@ vect_get_vec_defs (tree op0, tree op1, stmt_vec_info stmt_info, { if (slp_node) { - int nops = (op1 == NULL_TREE) ? 1 : 2; - auto_vec ops (nops); - auto_vec > vec_defs (nops); - - ops.quick_push (op0); - if (op1) - ops.quick_push (op1); - - vect_get_slp_defs (ops, slp_node, &vec_defs); - + auto_vec > vec_defs (SLP_TREE_CHILDREN (slp_node).length ()); + vect_get_slp_defs (slp_node, &vec_defs, op1 ? 2 : 1); *vec_oprnds0 = vec_defs[0]; if (op1) *vec_oprnds1 = vec_defs[1]; @@ -1874,7 +1850,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info, says how the load or store is going to be implemented and GROUP_SIZE is the number of load or store statements in the containing group. If the access is a gather load or scatter store, GS_INFO describes - its arguments. + its arguments. If the load or store is conditional, SCALAR_MASK is the + condition under which it occurs. Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not supported, otherwise record the required mask types. */ @@ -1883,7 +1860,7 @@ static void check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, vec_load_store_type vls_type, int group_size, vect_memory_access_type memory_access_type, - gather_scatter_info *gs_info) + gather_scatter_info *gs_info, tree scalar_mask) { /* Invariant loads need no special support. 
*/ if (memory_access_type == VMAT_INVARIANT) @@ -1907,7 +1884,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, return; } unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype); - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype); + vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask); return; } @@ -1931,7 +1908,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, return; } unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype); - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype); + vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask); return; } @@ -1949,9 +1926,8 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, } machine_mode mask_mode; - if (!(targetm.vectorize.get_mask_mode - (GET_MODE_NUNITS (vecmode), - GET_MODE_SIZE (vecmode)).exists (&mask_mode)) + if (!VECTOR_MODE_P (vecmode) + || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode) || !can_vec_mask_load_store_p (vecmode, mask_mode, is_load)) { if (dump_enabled_p ()) @@ -1969,7 +1945,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); unsigned int nvectors; if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors)) - vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype); + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask); else gcc_unreachable (); } @@ -2311,6 +2287,29 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp, && gap < (vect_known_alignment_in_bytes (first_dr_info) / vect_get_scalar_dr_size (first_dr_info))) overrun_p = false; + + /* If the gap splits the vector in half and the target + can do half-vector operations avoid the epilogue peeling + by simply loading half of the vector only. Usually + the construction with an upper zero half will be elided. 
*/ + dr_alignment_support alignment_support_scheme; + scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); + machine_mode vmode; + if (overrun_p + && !masked_p + && (((alignment_support_scheme + = vect_supportable_dr_alignment (first_dr_info, false))) + == dr_aligned + || alignment_support_scheme == dr_unaligned_supported) + && known_eq (nunits, (group_size - gap) * 2) + && known_eq (nunits, group_size) + && related_vector_mode (TYPE_MODE (vectype), elmode, + group_size - gap).exists (&vmode) + && (convert_optab_handler (vec_init_optab, + TYPE_MODE (vectype), vmode) + != CODE_FOR_nothing)) + overrun_p = false; + if (overrun_p && !can_overrun_p) { if (dump_enabled_p ()) @@ -2536,6 +2535,7 @@ vect_check_load_store_mask (stmt_vec_info stmt_info, tree mask, vect_def_type *mask_dt_out, tree *mask_vectype_out) { + vec_info *vinfo = stmt_info->vinfo; if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (mask))) { if (dump_enabled_p ()) @@ -2564,7 +2564,7 @@ vect_check_load_store_mask (stmt_vec_info stmt_info, tree mask, tree vectype = STMT_VINFO_VECTYPE (stmt_info); if (!mask_vectype) - mask_vectype = get_mask_type_for_scalar_type (TREE_TYPE (vectype)); + mask_vectype = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (vectype)); if (!mask_vectype || !VECTOR_BOOLEAN_TYPE_P (mask_vectype)) { @@ -2728,7 +2728,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, || TREE_CODE (masktype) == INTEGER_TYPE || types_compatible_p (srctype, masktype))); if (mask && TREE_CODE (masktype) == INTEGER_TYPE) - masktype = build_same_sized_truth_vector_type (srctype); + masktype = truth_type_for (srctype); tree mask_halftype = masktype; tree perm_mask = NULL_TREE; @@ -2774,8 +2774,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices); } else if (mask) - mask_halftype - = build_same_sized_truth_vector_type (gs_info->offset_vectype); + mask_halftype = truth_type_for (gs_info->offset_vectype); } else gcc_unreachable (); @@ -2952,6 +2951,7 @@ vect_get_gather_scatter_ops (struct loop *loop, stmt_vec_info stmt_info, gather_scatter_info *gs_info, tree *dataref_ptr, tree *vec_offset) { + vec_info *vinfo = stmt_info->vinfo; gimple_seq stmts = NULL; *dataref_ptr = force_gimple_operand (gs_info->base, &stmts, true, NULL_TREE); if (stmts != NULL) @@ -2962,7 +2962,7 @@ vect_get_gather_scatter_ops (struct loop *loop, stmt_vec_info stmt_info, gcc_assert (!new_bb); } tree offset_type = TREE_TYPE (gs_info->offset); - tree offset_vectype = get_vectype_for_scalar_type (offset_type); + tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type); *vec_offset = vect_get_vec_def_for_operand (gs_info->offset, stmt_info, offset_vectype); } @@ -2997,7 +2997,7 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info, /* The offset given in GS_INFO can have pointer type, so use the element type of the vector instead. */ tree offset_type = TREE_TYPE (gs_info->offset); - tree offset_vectype = get_vectype_for_scalar_type (offset_type); + tree offset_vectype = get_vectype_for_scalar_type (loop_vinfo, offset_type); offset_type = TREE_TYPE (offset_vectype); /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. 
*/ @@ -3161,8 +3161,7 @@ simple_integer_narrowing (tree vectype_out, tree vectype_in, int multi_step_cvt = 0; auto_vec interm_types; if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in, - &code, &multi_step_cvt, - &interm_types) + &code, &multi_step_cvt, &interm_types) || multi_step_cvt) return false; @@ -3295,10 +3294,10 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; } } - /* If all arguments are external or constant defs use a vector type with - the same size as the output vector type. */ + /* If all arguments are external or constant defs, infer the vector type + from the scalar type. */ if (!vectype_in) - vectype_in = get_same_sized_vectype (rhs_type, vectype_out); + vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type); if (vec_stmt) gcc_assert (vectype_in); if (!vectype_in) @@ -3309,6 +3308,19 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; } + /* FORNOW: we don't yet support mixtures of vector sizes for calls, + just mixtures of nunits. E.g. DI->SI versions of __builtin_ctz* + are traditionally vectorized as two VnDI->VnDI IFN_CTZs followed + by a pack of the two vectors into an SI vector. We would need + separate code to handle direct VnDI->VnSI IFN_CTZs. */ + if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype_out)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "mismatched vector sizes %T and %T\n", + vectype_in, vectype_out); + return false; + } /* FORNOW */ nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); @@ -3415,7 +3427,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, unsigned int nvectors = (slp_node ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) : ncopies); - vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out); + tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno); + vect_record_loop_mask (loop_vinfo, masks, nvectors, + vectype_out, scalar_mask); } return true; } @@ -3446,9 +3460,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, auto_vec > vec_defs (nargs); vec vec_oprnds0; - for (i = 0; i < nargs; i++) - vargs[i] = gimple_call_arg (stmt, i); - vect_get_slp_defs (vargs, slp_node, &vec_defs); + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds0 = vec_defs[0]; /* Arguments are ready. Create the new vector stmt. 
*/ @@ -3470,8 +3482,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, = gimple_build_call_internal_vec (ifn, vargs); gimple_call_set_lhs (call, half_res); gimple_call_set_nothrow (call, true); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, call, gsi); + vect_finish_stmt_generation (stmt_info, call, gsi); if ((i & 1) == 0) { prev_res = half_res; @@ -3523,8 +3534,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (mask_opno >= 0 && !vectypes[mask_opno]) { gcc_assert (modifier != WIDEN); - vectypes[mask_opno] - = build_same_sized_truth_vector_type (vectype_in); + vectypes[mask_opno] = truth_type_for (vectype_in); } for (i = 0; i < nargs; i++) @@ -3570,8 +3580,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, gcall *call = gimple_build_call_internal_vec (ifn, vargs); gimple_call_set_lhs (call, half_res); gimple_call_set_nothrow (call, true); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, call, gsi); + vect_finish_stmt_generation (stmt_info, call, gsi); if ((j & 1) == 0) { prev_res = half_res; @@ -3622,9 +3631,7 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, auto_vec > vec_defs (nargs); vec vec_oprnds0; - for (i = 0; i < nargs; i++) - vargs.quick_push (gimple_call_arg (stmt, i)); - vect_get_slp_defs (vargs, slp_node, &vec_defs); + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds0 = vec_defs[0]; /* Arguments are ready. Create the new vector stmt. */ @@ -4087,9 +4094,8 @@ vectorizable_simd_clone_call (stmt_vec_info stmt_info, || arginfo[i].dt == vect_external_def) && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) { - arginfo[i].vectype - = get_vectype_for_scalar_type (TREE_TYPE (gimple_call_arg (stmt, - i))); + tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i)); + arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type); if (arginfo[i].vectype == NULL || (simd_clone_subparts (arginfo[i].vectype) > bestn->simdclone->simdlen)) @@ -4802,10 +4808,10 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } - /* If op0 is an external or constant defs use a vector type of - the same size as the output vector type. */ + /* If op0 is an external or constant def, infer the vector type + from the scalar type. */ if (!vectype_in) - vectype_in = get_same_sized_vectype (rhs_type, vectype_out); + vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type); if (vec_stmt) gcc_assert (vectype_in); if (!vectype_in) @@ -4863,7 +4869,9 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, switch (modifier) { case NONE: - if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR) + if (code != FIX_TRUNC_EXPR + && code != FLOAT_EXPR + && !CONVERT_EXPR_CODE_P (code)) return false; if (supportable_convert_operation (code, vectype_out, vectype_in, &decl1, &code1)) @@ -5452,7 +5460,7 @@ vectorizable_assignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, either as shift by a scalar or by a vector. 
*/ bool -vect_supportable_shift (enum tree_code code, tree scalar_type) +vect_supportable_shift (vec_info *vinfo, enum tree_code code, tree scalar_type) { machine_mode vec_mode; @@ -5460,7 +5468,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type) int icode; tree vectype; - vectype = get_vectype_for_scalar_type (scalar_type); + vectype = get_vectype_for_scalar_type (vinfo, scalar_type); if (!vectype) return false; @@ -5491,7 +5499,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type) stmt to replace it, put it in VEC_STMT, and insert it at GSI. Return true if STMT_INFO is vectorizable in this way. */ -bool +static bool vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, stmt_vec_info *vec_stmt, slp_tree slp_node, stmt_vector_for_cost *cost_vec) @@ -5524,6 +5532,7 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, bool scalar_shift_arg = true; bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); vec_info *vinfo = stmt_info->vinfo; + bool incompatible_op1_vectype_p = false; if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) return false; @@ -5565,10 +5574,10 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, "use not simple.\n"); return false; } - /* If op0 is an external or constant def use a vector type with - the same size as the output vector type. */ + /* If op0 is an external or constant def, infer the vector type + from the scalar type. */ if (!vectype) - vectype = get_same_sized_vectype (TREE_TYPE (op0), vectype_out); + vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0)); if (vec_stmt) gcc_assert (vectype); if (!vectype) @@ -5666,9 +5675,16 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, "vector/vector shift/rotate found.\n"); if (!op1_vectype) - op1_vectype = get_same_sized_vectype (TREE_TYPE (op1), vectype_out); - if (op1_vectype == NULL_TREE - || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype)) + op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1)); + incompatible_op1_vectype_p + = (op1_vectype == NULL_TREE + || maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype), + TYPE_VECTOR_SUBPARTS (vectype)) + || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype)); + if (incompatible_op1_vectype_p + && (!slp_node + || SLP_TREE_DEF_TYPE + (SLP_TREE_CHILDREN (slp_node)[1]) != vect_constant_def)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -5707,7 +5723,10 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, so make sure the scalar is the right type if we are dealing with vectors of long long/long/short/char. */ if (dt[1] == vect_constant_def) - op1 = fold_convert (TREE_TYPE (vectype), op1); + { + if (!slp_node) + op1 = fold_convert (TREE_TYPE (vectype), op1); + } else if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (op1))) { @@ -5818,6 +5837,21 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } } + else if (slp_node && incompatible_op1_vectype_p) + { + /* Convert the scalar constant shift amounts in-place. 
*/ + slp_tree shift = SLP_TREE_CHILDREN (slp_node)[1]; + gcc_assert (SLP_TREE_DEF_TYPE (shift) == vect_constant_def); + for (unsigned i = 0; + i < SLP_TREE_SCALAR_OPS (shift).length (); ++i) + { + SLP_TREE_SCALAR_OPS (shift)[i] + = fold_convert (TREE_TYPE (vectype), + SLP_TREE_SCALAR_OPS (shift)[i]); + gcc_assert ((TREE_CODE (SLP_TREE_SCALAR_OPS (shift)[i]) + == INTEGER_CST)); + } + } /* vec_oprnd1 is available if operand 1 should be of a scalar-type (a special case for certain kind of vector shifts); otherwise, @@ -5894,7 +5928,7 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, poly_uint64 nunits_in; poly_uint64 nunits_out; tree vectype_out; - int ncopies; + int ncopies, vec_num; int j, i; vec vec_oprnds0 = vNULL; vec vec_oprnds1 = vNULL; @@ -5964,8 +5998,8 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, "use not simple.\n"); return false; } - /* If op0 is an external or constant def use a vector type with - the same size as the output vector type. */ + /* If op0 is an external or constant def, infer the vector type + from the scalar type. */ if (!vectype) { /* For boolean type we cannot determine vectype by @@ -5985,7 +6019,7 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vectype = vectype_out; } else - vectype = get_same_sized_vectype (TREE_TYPE (op0), vectype_out); + vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0)); } if (vec_stmt) gcc_assert (vectype); @@ -6031,9 +6065,15 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in case of SLP. */ if (slp_node) - ncopies = 1; + { + ncopies = 1; + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + } else - ncopies = vect_get_num_copies (loop_vinfo, vectype); + { + ncopies = vect_get_num_copies (loop_vinfo, vectype); + vec_num = 1; + } gcc_assert (ncopies >= 1); @@ -6086,8 +6126,34 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; } + int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info); + vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL); + internal_fn cond_fn = get_conditional_internal_fn (code); + if (!vec_stmt) /* transformation not required. */ { + /* If this operation is part of a reduction, a fully-masked loop + should only change the active lanes of the reduction chain, + keeping the inactive lanes as-is. */ + if (loop_vinfo + && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) + && reduc_idx >= 0) + { + if (cond_fn == IFN_LAST + || !direct_internal_fn_supported_p (cond_fn, vectype, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because no" + " conditional operation is available.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else + vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, + vectype, NULL); + } + STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; DUMP_VECT_SCOPE ("vectorizable_operation"); vect_model_simple_cost (stmt_info, ncopies, dt, ndts, slp_node, cost_vec); @@ -6100,6 +6166,8 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, dump_printf_loc (MSG_NOTE, vect_location, "transform binary/unary operation.\n"); + bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as vectors with unsigned elements, but the result is signed. 
So, we need to compute the MINUS_EXPR into vectype temporary and @@ -6180,12 +6248,8 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { if (slp_node) { - auto_vec ops(3); - ops.quick_push (op0); - ops.quick_push (op1); - ops.quick_push (op2); auto_vec > vec_defs(3); - vect_get_slp_defs (ops, slp_node, &vec_defs); + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds0 = vec_defs[0]; vec_oprnds1 = vec_defs[1]; vec_oprnds2 = vec_defs[2]; @@ -6221,22 +6285,41 @@ vectorizable_operation (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ? vec_oprnds1[i] : NULL_TREE); vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE); - gassign *new_stmt = gimple_build_assign (vec_dest, code, - vop0, vop1, vop2); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); - if (vec_cvt_dest) + if (masked_loop_p && reduc_idx >= 0) { - new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp); - gassign *new_stmt - = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR, - new_temp); - new_temp = make_ssa_name (vec_cvt_dest, new_stmt); + /* Perform the operation on active elements only and take + inactive elements from the reduction chain input. */ + gcc_assert (!vop2); + vop2 = reduc_idx == 1 ? vop1 : vop0; + tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, + vectype, i * ncopies + j); + gcall *call = gimple_build_call_internal (cond_fn, 4, mask, + vop0, vop1, vop2); + new_temp = make_ssa_name (vec_dest, call); + gimple_call_set_lhs (call, new_temp); + gimple_call_set_nothrow (call, true); + new_stmt_info + = vect_finish_stmt_generation (stmt_info, call, gsi); + } + else + { + gassign *new_stmt = gimple_build_assign (vec_dest, code, + vop0, vop1, vop2); + new_temp = make_ssa_name (vec_dest, new_stmt); gimple_assign_set_lhs (new_stmt, new_temp); new_stmt_info = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + if (vec_cvt_dest) + { + new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp); + gassign *new_stmt + = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR, + new_temp); + new_temp = make_ssa_name (vec_cvt_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + } } if (slp_node) SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); @@ -6517,7 +6600,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) check_load_store_masking (loop_vinfo, vectype, vls_type, group_size, - memory_access_type, &gs_info); + memory_access_type, &gs_info, mask); STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type, @@ -6580,8 +6663,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ncopies *= 2; if (mask) - mask_halfvectype - = build_same_sized_truth_vector_type (gs_info.offset_vectype); + mask_halfvectype = truth_type_for (gs_info.offset_vectype); } else gcc_unreachable (); @@ -6840,9 +6922,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, of vector elts directly. 
*/ scalar_mode elmode = SCALAR_TYPE_MODE (elem_type); machine_mode vmode; - if (!mode_for_vector (elmode, group_size).exists (&vmode) - || !VECTOR_MODE_P (vmode) - || !targetm.vector_mode_supported_p (vmode) + if (!related_vector_mode (TYPE_MODE (vectype), elmode, + group_size).exists (&vmode) || (convert_optab_handler (vec_extract_optab, TYPE_MODE (vectype), vmode) == CODE_FOR_nothing)) @@ -6859,9 +6940,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, element extracts from the original vector type and element size stores. */ if (int_mode_for_size (lsize, 0).exists (&elmode) - && mode_for_vector (elmode, lnunits).exists (&vmode) - && VECTOR_MODE_P (vmode) - && targetm.vector_mode_supported_p (vmode) + && related_vector_mode (TYPE_MODE (vectype), elmode, + lnunits).exists (&vmode) && (convert_optab_handler (vec_extract_optab, vmode, elmode) != CODE_FOR_nothing)) @@ -7624,14 +7704,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (!scalar_dest) return false; - if (slp_node != NULL) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "SLP of masked loads not supported.\n"); - return false; - } - int mask_index = internal_fn_mask_index (ifn); if (mask_index >= 0) { @@ -7714,6 +7786,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); group_size = DR_GROUP_SIZE (first_stmt_info); + /* Refuse non-SLP vectorization of SLP-only groups. */ + if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "cannot vectorize load in non-SLP mode.\n"); + return false; + } + if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) slp_perm = true; @@ -7767,7 +7848,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size, - memory_access_type, &gs_info); + memory_access_type, &gs_info, mask); STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; vect_model_load_cost (stmt_info, ncopies, memory_access_type, @@ -7947,9 +8028,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vector elts directly. */ scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); machine_mode vmode; - if (mode_for_vector (elmode, group_size).exists (&vmode) - && VECTOR_MODE_P (vmode) - && targetm.vector_mode_supported_p (vmode) + if (related_vector_mode (TYPE_MODE (vectype), elmode, + group_size).exists (&vmode) && (convert_optab_handler (vec_init_optab, TYPE_MODE (vectype), vmode) != CODE_FOR_nothing)) @@ -7973,9 +8053,8 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* If we can't construct such a vector fall back to element loads of the original vector type. 
*/ if (int_mode_for_size (lsize, 0).exists (&elmode) - && mode_for_vector (elmode, lnunits).exists (&vmode) - && VECTOR_MODE_P (vmode) - && targetm.vector_mode_supported_p (vmode) + && related_vector_mode (TYPE_MODE (vectype), elmode, + lnunits).exists (&vmode) && (convert_optab_handler (vec_init_optab, vmode, elmode) != CODE_FOR_nothing)) { @@ -8413,8 +8492,17 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, simd_lane_access_p, byte_offset, bump); if (mask) - vec_mask = vect_get_vec_def_for_operand (mask, stmt_info, - mask_vectype); + { + if (slp_node) + { + auto_vec > vec_defs (1); + vect_get_slp_defs (slp_node, &vec_defs); + vec_mask = vec_defs[0][0]; + } + else + vec_mask = vect_get_vec_def_for_operand (mask, stmt_info, + mask_vectype); + } } else { @@ -8564,8 +8652,25 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } else { + tree ltype = vectype; + /* If there's no peeling for gaps but we have a gap + with slp loads then load the lower half of the + vector only. See get_group_load_store_type for + when we apply this optimization. */ + if (slp + && loop_vinfo + && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + && DR_GROUP_GAP (first_stmt_info) != 0 + && known_eq (nunits, + (group_size + - DR_GROUP_GAP (first_stmt_info)) * 2) + && known_eq (nunits, group_size)) + ltype = build_vector_type (TREE_TYPE (vectype), + (group_size + - DR_GROUP_GAP + (first_stmt_info))); data_ref - = fold_build2 (MEM_REF, vectype, dataref_ptr, + = fold_build2 (MEM_REF, ltype, dataref_ptr, dataref_offset ? dataref_offset : build_int_cst (ref_type, 0)); @@ -8579,6 +8684,23 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type)); + if (ltype != vectype) + { + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); + tree tem = make_ssa_name (ltype); + new_stmt = gimple_build_assign (tem, data_ref); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + data_ref = NULL; + vec *v; + vec_alloc (v, 2); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, + build_zero_cst (ltype)); + new_stmt + = gimple_build_assign (vec_dest, + build_constructor + (vectype, v)); + } } break; } @@ -8864,7 +8986,7 @@ vect_is_simple_cond (tree cond, vec_info *vinfo, scalar_type = build_nonstandard_integer_type (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), TYPE_UNSIGNED (scalar_type)); - *comp_vectype = get_vectype_for_scalar_type (scalar_type); + *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type); } return true; @@ -8881,9 +9003,9 @@ vect_is_simple_cond (tree cond, vec_info *vinfo, Return true if STMT_INFO is vectorizable in this way. */ -bool +static bool vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, - stmt_vec_info *vec_stmt, bool for_reduction, + stmt_vec_info *vec_stmt, slp_tree slp_node, stmt_vector_for_cost *cost_vec) { vec_info *vinfo = stmt_info->vinfo; @@ -8913,22 +9035,39 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, tree vec_cmp_type; bool masked = false; - if (for_reduction && STMT_SLP_TYPE (stmt_info)) + if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) + return false; + + /* Is vectorizable conditional operation? 
*/ + gassign *stmt = dyn_cast (stmt_info->stmt); + if (!stmt) + return false; + + code = gimple_assign_rhs_code (stmt); + if (code != COND_EXPR) return false; - vect_reduction_type reduction_type - = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); - if (reduction_type == TREE_CODE_REDUCTION) + stmt_vec_info reduc_info = NULL; + int reduc_index = -1; + vect_reduction_type reduction_type = TREE_CODE_REDUCTION; + bool for_reduction + = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL; + if (for_reduction) { - if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) + if (STMT_SLP_TYPE (stmt_info)) return false; - - if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def - && !(STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle - && for_reduction)) + reduc_info = info_for_reduction (stmt_info); + reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); + reduc_index = STMT_VINFO_REDUC_IDX (stmt_info); + gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION + || reduc_index != -1); + } + else + { + if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def) return false; - /* FORNOW: not yet supported. */ + /* FORNOW: only supported as part of a reduction. */ if (STMT_VINFO_LIVE_P (stmt_info)) { if (dump_enabled_p ()) @@ -8938,16 +9077,6 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } - /* Is vectorizable conditional operation? */ - gassign *stmt = dyn_cast (stmt_info->stmt); - if (!stmt) - return false; - - code = gimple_assign_rhs_code (stmt); - - if (code != COND_EXPR) - return false; - tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; @@ -8981,7 +9110,7 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; masked = !COMPARISON_CLASS_P (cond_expr); - vec_cmp_type = build_same_sized_truth_vector_type (comp_vectype); + vec_cmp_type = truth_type_for (comp_vectype); if (vec_cmp_type == NULL_TREE) return false; @@ -8993,6 +9122,29 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, cond_expr1 = TREE_OPERAND (cond_expr, 1); } + /* For conditional reductions, the "then" value needs to be the candidate + value calculated by this iteration while the "else" value needs to be + the result carried over from previous iterations. If the COND_EXPR + is the other way around, we need to swap it. */ + bool must_invert_cmp_result = false; + if (reduction_type == EXTRACT_LAST_REDUCTION && reduc_index == 1) + { + if (masked) + must_invert_cmp_result = true; + else + { + bool honor_nans = HONOR_NANS (TREE_TYPE (cond_expr0)); + tree_code new_code = invert_tree_comparison (cond_code, honor_nans); + if (new_code == ERROR_MARK) + must_invert_cmp_result = true; + else + cond_code = new_code; + } + /* Make sure we don't accidentally use the old condition. 
*/ + cond_expr = NULL_TREE; + std::swap (then_clause, else_clause); + } + if (!masked && VECTOR_BOOLEAN_TYPE_P (comp_vectype)) { /* Boolean values may have another representation in vectors @@ -9053,6 +9205,16 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; } } + if (loop_vinfo + && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) + && reduction_type == EXTRACT_LAST_REDUCTION) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't yet use a fully-masked loop for" + " EXTRACT_LAST_REDUCTION.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } if (expand_vec_cond_expr_p (vectype, comp_vectype, cond_code)) { @@ -9082,24 +9244,42 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* Handle cond expr. */ for (j = 0; j < ncopies; j++) { + tree loop_mask = NULL_TREE; + bool swap_cond_operands = false; + + /* See whether another part of the vectorized code applies a loop + mask to the condition, or to its inverse. */ + + if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + scalar_cond_masked_key cond (cond_expr, ncopies); + if (loop_vinfo->scalar_cond_masked_set.contains (cond)) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j); + } + else + { + bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0)); + cond.code = invert_tree_comparison (cond.code, honor_nans); + if (loop_vinfo->scalar_cond_masked_set.contains (cond)) + { + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + loop_mask = vect_get_loop_mask (gsi, masks, ncopies, + vectype, j); + cond_code = cond.code; + swap_cond_operands = true; + } + } + } + stmt_vec_info new_stmt_info = NULL; if (j == 0) { if (slp_node) { - auto_vec ops; auto_vec, 4> vec_defs; - - if (masked) - ops.safe_push (cond_expr); - else - { - ops.safe_push (cond_expr0); - ops.safe_push (cond_expr1); - } - ops.safe_push (then_clause); - ops.safe_push (else_clause); - vect_get_slp_defs (ops, slp_node, &vec_defs); + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds3 = vec_defs.pop (); vec_oprnds2 = vec_defs.pop (); if (!masked) @@ -9159,6 +9339,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_then_clause = vec_oprnds2[i]; vec_else_clause = vec_oprnds3[i]; + if (swap_cond_operands) + std::swap (vec_then_clause, vec_else_clause); + if (masked) vec_compare = vec_cond_lhs; else @@ -9197,6 +9380,50 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } } + + /* If we decided to apply a loop mask to the result of the vector + comparison, AND the comparison with the mask now. Later passes + should then be able to reuse the AND results between mulitple + vector statements. + + For example: + for (int i = 0; i < 100; ++i) + x[i] = y[i] ? z[i] : 10; + + results in following optimized GIMPLE: + + mask__35.8_43 = vect__4.7_41 != { 0, ... }; + vec_mask_and_46 = loop_mask_40 & mask__35.8_43; + _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B]; + vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46); + vect_iftmp.12_52 = VEC_COND_EXPR ; + + instead of using a masked and unmasked forms of + vec != { 0, ... } (masked in the MASK_LOAD, + unmasked in the VEC_COND_EXPR). 
*/ + + if (loop_mask) + { + if (COMPARISON_CLASS_P (vec_compare)) + { + tree tmp = make_ssa_name (vec_cmp_type); + tree op0 = TREE_OPERAND (vec_compare, 0); + tree op1 = TREE_OPERAND (vec_compare, 1); + gassign *g = gimple_build_assign (tmp, + TREE_CODE (vec_compare), + op0, op1); + vect_finish_stmt_generation (stmt_info, g, gsi); + vec_compare = tmp; + } + + tree tmp2 = make_ssa_name (vec_cmp_type); + gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR, + vec_compare, loop_mask); + vect_finish_stmt_generation (stmt_info, g, gsi); + vec_compare = tmp2; + } + if (reduction_type == EXTRACT_LAST_REDUCTION) { if (!is_gimple_val (vec_compare)) @@ -9207,6 +9434,15 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vect_finish_stmt_generation (stmt_info, new_stmt, gsi); vec_compare = vec_compare_name; } + if (must_invert_cmp_result) + { + tree vec_compare_name = make_ssa_name (vec_cmp_type); + gassign *new_stmt = gimple_build_assign (vec_compare_name, + BIT_NOT_EXPR, + vec_compare); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + vec_compare = vec_compare_name; + } gcall *new_stmt = gimple_build_call_internal (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare, vec_then_clause); @@ -9345,7 +9581,7 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* Invariant comparison. */ if (!vectype) { - vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1)); + vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits)) return false; } @@ -9446,12 +9682,8 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { if (slp_node) { - auto_vec ops; auto_vec, 2> vec_defs; - - ops.safe_push (rhs1); - ops.safe_push (rhs2); - vect_get_slp_defs (ops, slp_node, &vec_defs); + vect_get_slp_defs (slp_node, &vec_defs); vec_oprnds1 = vec_defs.pop (); vec_oprnds0 = vec_defs.pop (); if (swap_p) @@ -9544,7 +9776,8 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, static bool can_vectorize_live_stmts (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, - slp_tree slp_node, stmt_vec_info *vec_stmt, + slp_tree slp_node, slp_instance slp_node_instance, + stmt_vec_info *vec_stmt, stmt_vector_for_cost *cost_vec) { if (slp_node) @@ -9554,13 +9787,15 @@ can_vectorize_live_stmts (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info) { if (STMT_VINFO_LIVE_P (slp_stmt_info) - && !vectorizable_live_operation (slp_stmt_info, gsi, slp_node, i, + && !vectorizable_live_operation (slp_stmt_info, gsi, slp_node, + slp_node_instance, i, vec_stmt, cost_vec)) return false; } } else if (STMT_VINFO_LIVE_P (stmt_info) - && !vectorizable_live_operation (stmt_info, gsi, slp_node, -1, + && !vectorizable_live_operation (stmt_info, gsi, slp_node, + slp_node_instance, -1, vec_stmt, cost_vec)) return false; @@ -9736,14 +9971,13 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, || vectorizable_load (stmt_info, NULL, NULL, node, node_instance, cost_vec) || vectorizable_store (stmt_info, NULL, NULL, node, cost_vec) - || vectorizable_reduction (stmt_info, NULL, NULL, node, - node_instance, cost_vec) + || vectorizable_reduction (stmt_info, node, node_instance, cost_vec) || vectorizable_induction (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec) - || vectorizable_condition (stmt_info, NULL, NULL, false, node, - cost_vec) + || 
vectorizable_condition (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_comparison (stmt_info, NULL, NULL, node, - cost_vec)); + cost_vec) + || vectorizable_lc_phi (stmt_info, NULL, node)); else { if (bb_vinfo) @@ -9759,8 +9993,7 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, || vectorizable_load (stmt_info, NULL, NULL, node, node_instance, cost_vec) || vectorizable_store (stmt_info, NULL, NULL, node, cost_vec) - || vectorizable_condition (stmt_info, NULL, NULL, false, node, - cost_vec) + || vectorizable_condition (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_comparison (stmt_info, NULL, NULL, node, cost_vec)); } @@ -9775,7 +10008,9 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, need extra handling, except for vectorizable reductions. */ if (!bb_vinfo && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type - && !can_vectorize_live_stmts (stmt_info, NULL, node, NULL, cost_vec)) + && STMT_VINFO_TYPE (stmt_info) != lc_phi_info_type + && !can_vectorize_live_stmts (stmt_info, NULL, node, node_instance, + NULL, cost_vec)) return opt_result::failure_at (stmt_info->stmt, "not vectorized:" " live stmt not supported: %G", @@ -9864,8 +10099,7 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, break; case condition_vec_info_type: - done = vectorizable_condition (stmt_info, gsi, &vec_stmt, false, - slp_node, NULL); + done = vectorizable_condition (stmt_info, gsi, &vec_stmt, slp_node, NULL); gcc_assert (done); break; @@ -9887,8 +10121,18 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, break; case reduc_vec_info_type: - done = vectorizable_reduction (stmt_info, gsi, &vec_stmt, slp_node, - slp_node_instance, NULL); + done = vect_transform_reduction (stmt_info, gsi, &vec_stmt, slp_node); + gcc_assert (done); + break; + + case cycle_phi_info_type: + done = vect_transform_cycle_phi (stmt_info, &vec_stmt, slp_node, + slp_node_instance); + gcc_assert (done); + break; + + case lc_phi_info_type: + done = vectorizable_lc_phi (stmt_info, &vec_stmt, slp_node); gcc_assert (done); break; @@ -9944,19 +10188,66 @@ vect_transform_stmt (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } } - /* Handle stmts whose DEF is used outside the loop-nest that is - being vectorized. */ - if (STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type) - { - done = can_vectorize_live_stmts (stmt_info, gsi, slp_node, &vec_stmt, - NULL); - gcc_assert (done); - } - if (vec_stmt) STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; - return is_store; + if (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) + return is_store; + + /* If this stmt defines a value used on a backedge, update the + vectorized PHIs. 
*/ + stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); + stmt_vec_info reduc_info; + if (STMT_VINFO_REDUC_DEF (orig_stmt_info) + && vect_stmt_to_vectorize (orig_stmt_info) == stmt_info + && (reduc_info = info_for_reduction (orig_stmt_info)) + && STMT_VINFO_REDUC_TYPE (reduc_info) != FOLD_LEFT_REDUCTION + && STMT_VINFO_REDUC_TYPE (reduc_info) != EXTRACT_LAST_REDUCTION) + { + gphi *phi; + if (!slp_node + && (phi = dyn_cast + (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt)) + && dominated_by_p (CDI_DOMINATORS, + gimple_bb (orig_stmt_info->stmt), gimple_bb (phi))) + { + edge e = loop_latch_edge (gimple_bb (phi)->loop_father); + stmt_vec_info phi_info + = STMT_VINFO_VEC_STMT (STMT_VINFO_REDUC_DEF (orig_stmt_info)); + stmt_vec_info vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); + do + { + add_phi_arg (as_a (phi_info->stmt), + gimple_get_lhs (vec_stmt->stmt), e, + gimple_phi_arg_location (phi, e->dest_idx)); + phi_info = STMT_VINFO_RELATED_STMT (phi_info); + vec_stmt = STMT_VINFO_RELATED_STMT (vec_stmt); + } + while (phi_info); + gcc_assert (!vec_stmt); + } + else if (slp_node + && slp_node != slp_node_instance->reduc_phis) + { + slp_tree phi_node = slp_node_instance->reduc_phis; + gphi *phi = as_a (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt); + edge e = loop_latch_edge (gimple_bb (phi)->loop_father); + gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length () + == SLP_TREE_VEC_STMTS (slp_node).length ()); + for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i) + add_phi_arg (as_a (SLP_TREE_VEC_STMTS (phi_node)[i]->stmt), + gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt), + e, gimple_phi_arg_location (phi, e->dest_idx)); + } + } + + /* Handle stmts whose DEF is used outside the loop-nest that is + being vectorized. */ + done = can_vectorize_live_stmts (stmt_info, gsi, slp_node, + slp_node_instance, &vec_stmt, NULL); + gcc_assert (done); + + return false; } @@ -9979,18 +10270,28 @@ vect_remove_stores (stmt_vec_info first_stmt_info) } } -/* Function get_vectype_for_scalar_type_and_size. +/* If NUNITS is nonzero, return a vector type that contains NUNITS + elements of type SCALAR_TYPE, or null if the target doesn't support + such a type. - Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported - by the target. */ + If NUNITS is zero, return a vector type that contains elements of + type SCALAR_TYPE, choosing whichever vector size the target prefers. + + If PREVAILING_MODE is VOIDmode, we have not yet chosen a vector mode + for this vectorization region and want to "autodetect" the best choice. + Otherwise, PREVAILING_MODE is a previously-chosen vector TYPE_MODE + and we want the new type to be interoperable with it. PREVAILING_MODE + in this case can be a scalar integer mode or a vector mode; when it + is a vector mode, the function acts like a tree-level version of + related_vector_mode. */ tree -get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) +get_related_vectype_for_scalar_type (machine_mode prevailing_mode, + tree scalar_type, poly_uint64 nunits) { tree orig_scalar_type = scalar_type; scalar_mode inner_mode; machine_mode simd_mode; - poly_uint64 nunits; tree vectype; if (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode) @@ -10030,19 +10331,45 @@ get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) if (scalar_type == NULL_TREE) return NULL_TREE; - /* If no size was supplied use the mode the target prefers. Otherwise - lookup a vector mode of the specified size. 
*/ - if (known_eq (size, 0U)) - simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode); - else if (!multiple_p (size, nbytes, &nunits) - || !mode_for_vector (inner_mode, nunits).exists (&simd_mode)) - return NULL_TREE; - /* NOTE: nunits == 1 is allowed to support single element vector types. */ - if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits)) - return NULL_TREE; + /* If no prevailing mode was supplied, use the mode the target prefers. + Otherwise lookup a vector mode based on the prevailing mode. */ + if (prevailing_mode == VOIDmode) + { + gcc_assert (known_eq (nunits, 0U)); + simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode); + if (SCALAR_INT_MODE_P (simd_mode)) + { + /* Traditional behavior is not to take the integer mode + literally, but simply to use it as a way of determining + the vector size. It is up to mode_for_vector to decide + what the TYPE_MODE should be. + + Note that nunits == 1 is allowed in order to support single + element vector types. */ + if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits) + || !mode_for_vector (inner_mode, nunits).exists (&simd_mode)) + return NULL_TREE; + } + } + else if (SCALAR_INT_MODE_P (prevailing_mode) + || !related_vector_mode (prevailing_mode, + inner_mode, nunits).exists (&simd_mode)) + { + /* Fall back to using mode_for_vector, mostly in the hope of being + able to use an integer mode. */ + if (known_eq (nunits, 0U) + && !multiple_p (GET_MODE_SIZE (prevailing_mode), nbytes, &nunits)) + return NULL_TREE; - vectype = build_vector_type (scalar_type, nunits); + if (!mode_for_vector (inner_mode, nunits).exists (&simd_mode)) + return NULL_TREE; + } + + vectype = build_vector_type_for_mode (scalar_type, simd_mode); + /* In cases where the mode was chosen by mode_for_vector, check that + the target actually supports the chosen mode, or that it at least + allows the vector mode to be replaced by a like-sized integer. */ if (!VECTOR_MODE_P (TYPE_MODE (vectype)) && !INTEGRAL_MODE_P (TYPE_MODE (vectype))) return NULL_TREE; @@ -10056,22 +10383,22 @@ get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) return vectype; } -poly_uint64 current_vector_size; - /* Function get_vectype_for_scalar_type. Returns the vector type corresponding to SCALAR_TYPE as supported by the target. */ tree -get_vectype_for_scalar_type (tree scalar_type) +get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type) { - tree vectype; - vectype = get_vectype_for_scalar_type_and_size (scalar_type, - current_vector_size); - if (vectype - && known_eq (current_vector_size, 0U)) - current_vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); + tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode, + scalar_type); + if (vectype && vinfo->vector_mode == VOIDmode) + vinfo->vector_mode = TYPE_MODE (vectype); + + if (vectype) + vinfo->used_vector_modes.add (TYPE_MODE (vectype)); + return vectype; } @@ -10081,15 +10408,14 @@ get_vectype_for_scalar_type (tree scalar_type) of vectors of specified SCALAR_TYPE as supported by target. 
*/ tree -get_mask_type_for_scalar_type (tree scalar_type) +get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type) { - tree vectype = get_vectype_for_scalar_type (scalar_type); + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); if (!vectype) return NULL; - return build_truth_vector_type (TYPE_VECTOR_SUBPARTS (vectype), - current_vector_size); + return truth_type_for (vectype); } /* Function get_same_sized_vectype @@ -10101,10 +10427,29 @@ tree get_same_sized_vectype (tree scalar_type, tree vector_type) { if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)) - return build_same_sized_truth_vector_type (vector_type); + return truth_type_for (vector_type); + + poly_uint64 nunits; + if (!multiple_p (GET_MODE_SIZE (TYPE_MODE (vector_type)), + GET_MODE_SIZE (TYPE_MODE (scalar_type)), &nunits)) + return NULL_TREE; + + return get_related_vectype_for_scalar_type (TYPE_MODE (vector_type), + scalar_type, nunits); +} + +/* Return true if replacing LOOP_VINFO->vector_mode with VECTOR_MODE + would not change the chosen vector modes. */ - return get_vectype_for_scalar_type_and_size - (scalar_type, GET_MODE_SIZE (TYPE_MODE (vector_type))); +bool +vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode) +{ + for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin (); + i != vinfo->used_vector_modes.end (); ++i) + if (!VECTOR_MODE_P (*i) + || related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i) + return false; + return true; } /* Function vect_is_simple_use. @@ -10492,11 +10837,8 @@ supportable_widening_operation (enum tree_code code, stmt_vec_info stmt_info, { intermediate_mode = insn_data[icode1].operand[0].mode; if (VECTOR_BOOLEAN_TYPE_P (prev_type)) - { - intermediate_type = vect_halve_mask_nunits (prev_type); - if (intermediate_mode != TYPE_MODE (intermediate_type)) - return false; - } + intermediate_type + = vect_halve_mask_nunits (prev_type, intermediate_mode); else intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode, @@ -10680,11 +11022,8 @@ supportable_narrowing_operation (enum tree_code code, { intermediate_mode = insn_data[icode1].operand[0].mode; if (VECTOR_BOOLEAN_TYPE_P (prev_type)) - { - intermediate_type = vect_double_mask_nunits (prev_type); - if (intermediate_mode != TYPE_MODE (intermediate_type)) - return false; - } + intermediate_type + = vect_double_mask_nunits (prev_type, intermediate_mode); else intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode, uns); @@ -10777,6 +11116,7 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, tree *stmt_vectype_out, tree *nunits_vectype_out) { + vec_info *vinfo = stmt_info->vinfo; gimple *stmt = stmt_info->stmt; *stmt_vectype_out = NULL_TREE; @@ -10810,7 +11150,12 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, tree vectype; tree scalar_type = NULL_TREE; if (STMT_VINFO_VECTYPE (stmt_info)) - *stmt_vectype_out = vectype = STMT_VINFO_VECTYPE (stmt_info); + { + *stmt_vectype_out = vectype = STMT_VINFO_VECTYPE (stmt_info); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "precomputed vectype: %T\n", vectype); + } else { gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)); @@ -10842,8 +11187,8 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, - "get vectype for scalar type: %T\n", scalar_type); - vectype = get_vectype_for_scalar_type (scalar_type); + "get vectype for scalar type: %T\n", scalar_type); + vectype = get_vectype_for_scalar_type (vinfo, 
scalar_type); if (!vectype) return opt_result::failure_at (stmt, "not vectorized:" @@ -10859,42 +11204,38 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, /* Don't try to compute scalar types if the stmt produces a boolean vector; use the existing vector type instead. */ - tree nunits_vectype; - if (VECTOR_BOOLEAN_TYPE_P (vectype)) - nunits_vectype = vectype; - else + tree nunits_vectype = vectype; + if (!VECTOR_BOOLEAN_TYPE_P (vectype) + && *stmt_vectype_out != boolean_type_node) { /* The number of units is set according to the smallest scalar type (or the largest vector size, but we only support one vector size per vectorization). */ - if (*stmt_vectype_out != boolean_type_node) + HOST_WIDE_INT dummy; + scalar_type = vect_get_smallest_scalar_type (stmt_info, &dummy, &dummy); + if (scalar_type != TREE_TYPE (vectype)) { - HOST_WIDE_INT dummy; - scalar_type = vect_get_smallest_scalar_type (stmt_info, - &dummy, &dummy); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "get vectype for smallest scalar type: %T\n", + scalar_type); + nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + if (!nunits_vectype) + return opt_result::failure_at + (stmt, "not vectorized: unsupported data-type %T\n", + scalar_type); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n", + nunits_vectype); } - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "get vectype for scalar type: %T\n", scalar_type); - nunits_vectype = get_vectype_for_scalar_type (scalar_type); } - if (!nunits_vectype) - return opt_result::failure_at (stmt, - "not vectorized: unsupported data-type %T\n", - scalar_type); - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)), - GET_MODE_SIZE (TYPE_MODE (nunits_vectype)))) - return opt_result::failure_at (stmt, - "not vectorized: different sized vector " - "types in statement, %T and %T\n", - vectype, nunits_vectype); + gcc_assert (*stmt_vectype_out == boolean_type_node + || multiple_p (TYPE_VECTOR_SUBPARTS (nunits_vectype), + TYPE_VECTOR_SUBPARTS (*stmt_vectype_out))); if (dump_enabled_p ()) { - dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", - nunits_vectype); - dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (nunits_vectype)); dump_printf (MSG_NOTE, "\n"); @@ -10911,6 +11252,7 @@ vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, opt_tree vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) { + vec_info *vinfo = stmt_info->vinfo; gimple *stmt = stmt_info->stmt; tree mask_type = NULL; tree vectype, scalar_type; @@ -10920,7 +11262,7 @@ vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) { scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); - mask_type = get_mask_type_for_scalar_type (scalar_type); + mask_type = get_mask_type_for_scalar_type (vinfo, scalar_type); if (!mask_type) return opt_tree::failure_at (stmt, @@ -10968,7 +11310,7 @@ vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) && !VECTOR_BOOLEAN_TYPE_P (mask_type) && gimple_code (stmt) == GIMPLE_ASSIGN && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison) - mask_type = build_same_sized_truth_vector_type (mask_type); + mask_type = truth_type_for (mask_type); } /* No mask_type should mean loop invariant predicate. 
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index d89ec3b7c76..c2c6377d3f9 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -639,8 +639,11 @@ vec_info::new_stmt_vec_info (gimple *stmt) STMT_VINFO_TYPE (res) = undef_vec_info_type; STMT_VINFO_RELEVANT (res) = vect_unused_in_scope; STMT_VINFO_VECTORIZABLE (res) = true; - STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION; - STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK; + STMT_VINFO_REDUC_TYPE (res) = TREE_CODE_REDUCTION; + STMT_VINFO_REDUC_CODE (res) = ERROR_MARK; + STMT_VINFO_REDUC_FN (res) = IFN_LAST; + STMT_VINFO_REDUC_IDX (res) = -1; + STMT_VINFO_SLP_VECT_ONLY (res) = false; if (gimple_code (stmt) == GIMPLE_PHI && is_loop_header_bb_p (gimple_bb (stmt))) @@ -862,8 +865,7 @@ set_uid_loop_bbs (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) static unsigned try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, - unsigned *num_vectorized_loops, - loop_p loop, loop_vec_info orig_loop_vinfo, + unsigned *num_vectorized_loops, loop_p loop, gimple *loop_vectorized_call, gimple *loop_dist_alias_call) { @@ -871,6 +873,7 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, vec_info_shared shared; auto_purge_vect_location sentinel; vect_location = find_loop_location (loop); + if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION && dump_enabled_p ()) dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS, @@ -878,10 +881,17 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, LOCATION_FILE (vect_location.get_location_t ()), LOCATION_LINE (vect_location.get_location_t ())); - /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */ - opt_loop_vec_info loop_vinfo - = vect_analyze_loop (loop, orig_loop_vinfo, &shared); - loop->aux = loop_vinfo; + opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL); + /* In the case of epilogue vectorization the loop already has its + loop_vec_info set, we do not require to analyze the loop in this case. */ + if (loop_vec_info vinfo = loop_vec_info_for_loop (loop)) + loop_vinfo = opt_loop_vec_info::success (vinfo); + else + { + /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */ + loop_vinfo = vect_analyze_loop (loop, &shared); + loop->aux = loop_vinfo; + } if (!loop_vinfo) if (dump_enabled_p ()) @@ -968,7 +978,7 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, unsigned HOST_WIDE_INT bytes; if (dump_enabled_p ()) { - if (current_vector_size.is_constant (&bytes)) + if (GET_MODE_SIZE (loop_vinfo->vector_mode).is_constant (&bytes)) dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "loop vectorized using %wu byte vectors\n", bytes); else @@ -1009,8 +1019,13 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, /* Epilogue of vectorized loop must be vectorized too. */ if (new_loop) - ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, - new_loop, loop_vinfo, NULL, NULL); + { + /* Don't include vectorized epilogues in the "vectorized loops" count. 
+ */ + unsigned dont_count = *num_vectorized_loops; + ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count, + new_loop, NULL, NULL); + } return ret; } @@ -1026,8 +1041,7 @@ try_vectorize_loop (hash_table *&simduid_to_vf_htab, || loop->force_vectorize)) return 0; - return try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, - loop, NULL, + return try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, loop, vect_loop_vectorized_call (loop), vect_loop_dist_alias_call (loop)); } @@ -1344,7 +1358,8 @@ get_vec_alignment_for_array_type (tree type) gcc_assert (TREE_CODE (type) == ARRAY_TYPE); poly_uint64 array_size, vector_size; - tree vectype = get_vectype_for_scalar_type (strip_array_types (type)); + tree scalar_type = strip_array_types (type); + tree vectype = get_related_vectype_for_scalar_type (VOIDmode, scalar_type); if (!vectype || !poly_int_tree_p (TYPE_SIZE (type), &array_size) || !poly_int_tree_p (TYPE_SIZE (vectype), &vector_size) @@ -1512,3 +1527,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt) { return new pass_ipa_increase_alignment (ctxt); } + +/* If the condition represented by T is a comparison or the SSA name + result of a comparison, extract the comparison's operands. Represent + T as NE_EXPR otherwise. */ + +void +scalar_cond_masked_key::get_cond_ops_from_tree (tree t) +{ + if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison) + { + this->code = TREE_CODE (t); + this->op0 = TREE_OPERAND (t, 0); + this->op1 = TREE_OPERAND (t, 1); + return; + } + + if (TREE_CODE (t) == SSA_NAME) + if (gassign *stmt = dyn_cast (SSA_NAME_DEF_STMT (t))) + { + tree_code code = gimple_assign_rhs_code (stmt); + if (TREE_CODE_CLASS (code) == tcc_comparison) + { + this->code = code; + this->op0 = gimple_assign_rhs1 (stmt); + this->op1 = gimple_assign_rhs2 (stmt); + return; + } + } + + this->code = NE_EXPR; + this->op0 = t; + this->op1 = build_zero_cst (TREE_TYPE (t)); +} diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 148b9a7f215..c46e2742c36 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -26,6 +26,7 @@ typedef struct _stmt_vec_info *stmt_vec_info; #include "tree-data-ref.h" #include "tree-hash-traits.h" #include "target.h" +#include /* Used for naming of new temporaries. */ enum vect_var_kind { @@ -120,6 +121,8 @@ struct _slp_tree { vec children; /* A group of scalar stmts to be vectorized together. */ vec stmts; + /* A group of scalar operands to be vectorized together. */ + vec ops; /* Load permutation relative to the stores, NULL if there is no permutation. */ vec load_permutation; @@ -170,13 +173,82 @@ typedef struct _slp_instance { #define SLP_TREE_CHILDREN(S) (S)->children #define SLP_TREE_SCALAR_STMTS(S) (S)->stmts +#define SLP_TREE_SCALAR_OPS(S) (S)->ops #define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts #define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size #define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation #define SLP_TREE_TWO_OPERATORS(S) (S)->two_operators #define SLP_TREE_DEF_TYPE(S) (S)->def_type +/* Key for map that records association between + scalar conditions and corresponding loop mask, and + is populated by vect_record_loop_mask. 
*/ +struct scalar_cond_masked_key +{ + scalar_cond_masked_key (tree t, unsigned ncopies_) + : ncopies (ncopies_) + { + get_cond_ops_from_tree (t); + } + + void get_cond_ops_from_tree (tree); + + unsigned ncopies; + tree_code code; + tree op0; + tree op1; +}; + +template<> +struct default_hash_traits +{ + typedef scalar_cond_masked_key compare_type; + typedef scalar_cond_masked_key value_type; + + static inline hashval_t + hash (value_type v) + { + inchash::hash h; + h.add_int (v.code); + inchash::add_expr (v.op0, h, 0); + inchash::add_expr (v.op1, h, 0); + h.add_int (v.ncopies); + return h.end (); + } + + static inline bool + equal (value_type existing, value_type candidate) + { + return (existing.ncopies == candidate.ncopies + && existing.code == candidate.code + && operand_equal_p (existing.op0, candidate.op0, 0) + && operand_equal_p (existing.op1, candidate.op1, 0)); + } + + static inline void + mark_empty (value_type &v) + { + v.ncopies = 0; + } + + static inline bool + is_empty (value_type v) + { + return v.ncopies == 0; + } + + static inline void mark_deleted (value_type &) {} + + static inline bool is_deleted (const value_type &) + { + return false; + } + + static inline void remove (value_type &) {} +}; + +typedef hash_set scalar_cond_masked_set_type; /* Describes two objects whose addresses must be unequal for the vectorized loop to be valid. */ @@ -217,6 +289,7 @@ struct vec_info_shared { /* Vectorizer state common between loop and basic-block vectorization. */ struct vec_info { + typedef hash_set > mode_set; enum vec_kind { bb, loop }; vec_info (vec_kind, void *, vec_info_shared *); @@ -254,6 +327,14 @@ struct vec_info { /* Cost data used by the target cost model. */ void *target_cost_data; + /* The set of vector modes used in the vectorized region. */ + mode_set used_vector_modes; + + /* The argument we should pass to related_vector_mode when looking up + the vector mode for a scalar mode, or VOIDmode if we haven't yet + made any decisions about which vector modes to use. */ + machine_mode vector_mode; + private: stmt_vec_info new_stmt_vec_info (gimple *stmt); void set_vinfo_for_stmt (gimple *, stmt_vec_info); @@ -377,6 +458,8 @@ struct rgroup_masks { typedef auto_vec vec_loop_masks; +typedef auto_vec > drs_init_vec; + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -399,7 +482,7 @@ typedef struct _loop_vec_info : public vec_info { /* Condition under which this loop is analyzed and versioned. */ tree num_iters_assumptions; - /* Threshold of number of iterations below which vectorzation will not be + /* Threshold of number of iterations below which vectorization will not be performed. It is calculated from MIN_PROFITABLE_ITERS and PARAM_MIN_VECT_LOOP_BOUND. */ unsigned int th; @@ -421,6 +504,9 @@ typedef struct _loop_vec_info : public vec_info { on inactive scalars. */ vec_loop_masks masks; + /* Set of scalar conditions that have loop mask applied. */ + scalar_cond_masked_set_type scalar_cond_masked_set; + /* If we are using a loop mask to align memory addresses, this variable contains the number of vector elements that we should skip in the first iteration of the vector loop (i.e. the number of leading @@ -497,6 +583,13 @@ typedef struct _loop_vec_info : public vec_info { /* Cost of a single scalar iteration. */ int single_scalar_iteration_cost; + /* The cost of the vector prologue and epilogue, including peeled + iterations and set-up code. 
*/ + int vec_outside_cost; + + /* The cost of the vector loop body. */ + int vec_inside_cost; + /* Is the loop vectorizable? */ bool vectorizable; @@ -551,6 +644,10 @@ typedef struct _loop_vec_info : public vec_info { this points to the original vectorized loop. Otherwise NULL. */ _loop_vec_info *orig_loop_info; + /* Used to store loop_vec_infos of epilogues of this loop during + analysis. */ + vec<_loop_vec_info *> epilogue_vinfos; + } *loop_vec_info; /* Access Functions. */ @@ -682,6 +779,8 @@ enum stmt_vec_info_type { type_promotion_vec_info_type, type_demotion_vec_info_type, type_conversion_vec_info_type, + cycle_phi_info_type, + lc_phi_info_type, loop_exit_ctrl_vec_info_type }; @@ -917,21 +1016,42 @@ struct _stmt_vec_info { for loop vectorization. */ vect_memory_access_type memory_access_type; - /* For reduction loops, this is the type of reduction. */ - enum vect_reduction_type v_reduc_type; + /* For INTEGER_INDUC_COND_REDUCTION, the initial value to be used. */ + tree induc_cond_initial_val; - /* For CONST_COND_REDUCTION, record the reduc code. */ - enum tree_code const_cond_reduc_code; + /* If not NULL the value to be added to compute final reduction value. */ + tree reduc_epilogue_adjustment; /* On a reduction PHI the reduction type as detected by - vect_force_simple_reduction. */ + vect_is_simple_reduction and vectorizable_reduction. */ enum vect_reduction_type reduc_type; + /* The original reduction code, to be used in the epilogue. */ + enum tree_code reduc_code; + /* An internal function we should use in the epilogue. */ + internal_fn reduc_fn; + + /* On a stmt participating in the reduction the index of the operand + on the reduction SSA cycle. */ + int reduc_idx; + /* On a reduction PHI the def returned by vect_force_simple_reduction. On the def returned by vect_force_simple_reduction the corresponding PHI. */ stmt_vec_info reduc_def; + /* The vector input type relevant for reduction vectorization. */ + tree reduc_vectype_in; + + /* The vector type for performing the actual reduction. */ + tree reduc_vectype; + + /* Whether we force a single cycle PHI during reduction vectorization. */ + bool force_single_cycle; + + /* Whether on this stmt reduction meta is recorded. */ + bool is_reduc_info; + /* The number of scalar stmt references from active SLP instances. */ unsigned int num_slp_uses; @@ -949,6 +1069,9 @@ struct _stmt_vec_info { and OPERATION_BITS without changing the result. */ unsigned int operation_precision; signop operation_sign; + + /* True if this is only suitable for SLP vectorization. */ + bool slp_vect_only_p; }; /* Information about a gather/scatter call. 
*/ @@ -1011,8 +1134,10 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) #define STMT_VINFO_STRIDED_P(S) (S)->strided_p #define STMT_VINFO_MEMORY_ACCESS_TYPE(S) (S)->memory_access_type #define STMT_VINFO_SIMD_LANE_ACCESS_P(S) (S)->simd_lane_access_p -#define STMT_VINFO_VEC_REDUCTION_TYPE(S) (S)->v_reduc_type -#define STMT_VINFO_VEC_CONST_COND_REDUC_CODE(S) (S)->const_cond_reduc_code +#define STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL(S) (S)->induc_cond_initial_val +#define STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT(S) (S)->reduc_epilogue_adjustment +#define STMT_VINFO_REDUC_IDX(S) (S)->reduc_idx +#define STMT_VINFO_FORCE_SINGLE_CYCLE(S) (S)->force_single_cycle #define STMT_VINFO_DR_WRT_VEC_LOOP(S) (S)->dr_wrt_vec_loop #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_wrt_vec_loop.base_address @@ -1043,7 +1168,12 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist #define STMT_VINFO_NUM_SLP_USES(S) (S)->num_slp_uses #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type +#define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code +#define STMT_VINFO_REDUC_FN(S) (S)->reduc_fn #define STMT_VINFO_REDUC_DEF(S) (S)->reduc_def +#define STMT_VINFO_REDUC_VECTYPE(S) (S)->reduc_vectype +#define STMT_VINFO_REDUC_VECTYPE_IN(S) (S)->reduc_vectype_in +#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p #define DR_GROUP_FIRST_ELEMENT(S) \ (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element) @@ -1358,7 +1488,7 @@ vect_get_num_copies (loop_vec_info loop_vinfo, tree vectype) static inline void vect_update_max_nunits (poly_uint64 *max_nunits, poly_uint64 nunits) { - /* All unit counts have the form current_vector_size * X for some + /* All unit counts have the form vec_info::vector_size * X for some rational X, so two unit sizes must have a common multiple. Everything is a multiple of the initial value of 1. */ *max_nunits = force_common_multiple (*max_nunits, nunits); @@ -1466,20 +1596,22 @@ extern void vect_set_loop_condition (struct loop *, loop_vec_info, extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, struct loop *, edge); -struct loop *vect_loop_versioning (loop_vec_info, unsigned int, bool, - poly_uint64); +struct loop *vect_loop_versioning (loop_vec_info); extern struct loop *vect_do_peeling (loop_vec_info, tree, tree, - tree *, tree *, tree *, int, bool, bool); + tree *, tree *, tree *, int, bool, bool, + tree *, drs_init_vec &); extern void vect_prepare_for_masked_peels (loop_vec_info); extern dump_user_location_t find_loop_location (struct loop *); extern bool vect_can_advance_ivs_p (loop_vec_info); +extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code); /* In tree-vect-stmts.c. 
*/ -extern poly_uint64 current_vector_size; -extern tree get_vectype_for_scalar_type (tree); -extern tree get_vectype_for_scalar_type_and_size (tree, poly_uint64); -extern tree get_mask_type_for_scalar_type (tree); +extern tree get_related_vectype_for_scalar_type (machine_mode, tree, + poly_uint64 = 0); +extern tree get_vectype_for_scalar_type (vec_info *, tree); +extern tree get_mask_type_for_scalar_type (vec_info *, tree); extern tree get_same_sized_vectype (tree, tree); +extern bool vect_chooses_same_modes_p (vec_info *, machine_mode); extern bool vect_get_loop_mask_type (loop_vec_info); extern bool vect_is_simple_use (tree, vec_info *, enum vect_def_type *, stmt_vec_info * = NULL, gimple ** = NULL); @@ -1491,15 +1623,15 @@ extern bool supportable_widening_operation (enum tree_code, stmt_vec_info, enum tree_code *, int *, vec *); extern bool supportable_narrowing_operation (enum tree_code, tree, tree, - enum tree_code *, - int *, vec *); + enum tree_code *, int *, + vec *); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info, int, enum vect_cost_model_location); extern stmt_vec_info vect_finish_replace_stmt (stmt_vec_info, gimple *); extern stmt_vec_info vect_finish_stmt_generation (stmt_vec_info, gimple *, gimple_stmt_iterator *); -extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info); +extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info, bool *); extern tree vect_get_store_rhs (stmt_vec_info); extern tree vect_get_vec_def_for_operand_1 (stmt_vec_info, enum vect_def_type); extern tree vect_get_vec_def_for_operand (tree, stmt_vec_info, tree = NULL); @@ -1515,19 +1647,13 @@ extern bool vect_transform_stmt (stmt_vec_info, gimple_stmt_iterator *, extern void vect_remove_stores (stmt_vec_info); extern opt_result vect_analyze_stmt (stmt_vec_info, bool *, slp_tree, slp_instance, stmt_vector_for_cost *); -extern bool vectorizable_condition (stmt_vec_info, gimple_stmt_iterator *, - stmt_vec_info *, bool, slp_tree, - stmt_vector_for_cost *); -extern bool vectorizable_shift (stmt_vec_info, gimple_stmt_iterator *, - stmt_vec_info *, slp_tree, - stmt_vector_for_cost *); extern void vect_get_load_cost (stmt_vec_info, int, bool, unsigned int *, unsigned int *, stmt_vector_for_cost *, stmt_vector_for_cost *, bool); extern void vect_get_store_cost (stmt_vec_info, int, unsigned int *, stmt_vector_for_cost *); -extern bool vect_supportable_shift (enum tree_code, tree); +extern bool vect_supportable_shift (vec_info *, enum tree_code, tree); extern tree vect_gen_perm_mask_any (tree, const vec_perm_indices &); extern tree vect_gen_perm_mask_checked (tree, const vec_perm_indices &); extern void optimize_mask_stores (struct loop*); @@ -1557,7 +1683,7 @@ extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info, gather_scatter_info *); extern opt_result vect_find_stmt_data_reference (loop_p, gimple *, vec *); -extern opt_result vect_analyze_data_refs (vec_info *, poly_uint64 *); +extern opt_result vect_analyze_data_refs (vec_info *, poly_uint64 *, bool *); extern void vect_record_base_alignments (vec_info *); extern tree vect_create_data_ref_ptr (stmt_vec_info, tree, struct loop *, tree, tree *, gimple_stmt_iterator *, @@ -1586,40 +1712,43 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *, tree, tree = NULL_TREE); /* In tree-vect-loop.c. */ -/* FORNOW: Used in tree-parloops.c. 
*/ -extern stmt_vec_info vect_force_simple_reduction (loop_vec_info, stmt_vec_info, - bool *, bool); -/* Used in gimple-loop-interchange.c. */ +/* Used in tree-vect-loop-manip.c */ +extern void determine_peel_for_niter (loop_vec_info); +/* Used in gimple-loop-interchange.c and tree-parloops.c. */ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree, enum tree_code); +extern bool needs_fold_left_reduction_p (tree, tree_code); /* Drive for loop analysis stage. */ -extern opt_loop_vec_info vect_analyze_loop (struct loop *, - loop_vec_info, - vec_info_shared *); +extern opt_loop_vec_info vect_analyze_loop (struct loop *, vec_info_shared *); extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL); extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *, tree *, bool); -extern tree vect_halve_mask_nunits (tree); -extern tree vect_double_mask_nunits (tree); +extern tree vect_halve_mask_nunits (tree, machine_mode); +extern tree vect_double_mask_nunits (tree, machine_mode); extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *, - unsigned int, tree); + unsigned int, tree, tree); extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *, unsigned int, tree, unsigned int); +extern stmt_vec_info info_for_reduction (stmt_vec_info); /* Drive for loop transformation stage. */ extern struct loop *vect_transform_loop (loop_vec_info); extern opt_loop_vec_info vect_analyze_loop_form (struct loop *, vec_info_shared *); extern bool vectorizable_live_operation (stmt_vec_info, gimple_stmt_iterator *, - slp_tree, int, stmt_vec_info *, + slp_tree, slp_instance, int, + stmt_vec_info *, stmt_vector_for_cost *); -extern bool vectorizable_reduction (stmt_vec_info, gimple_stmt_iterator *, - stmt_vec_info *, slp_tree, slp_instance, +extern bool vectorizable_reduction (stmt_vec_info, slp_tree, slp_instance, stmt_vector_for_cost *); extern bool vectorizable_induction (stmt_vec_info, gimple_stmt_iterator *, stmt_vec_info *, slp_tree, stmt_vector_for_cost *); -extern tree get_initial_def_for_reduction (stmt_vec_info, tree, tree *); +extern bool vect_transform_reduction (stmt_vec_info, gimple_stmt_iterator *, + stmt_vec_info *, slp_tree); +extern bool vect_transform_cycle_phi (stmt_vec_info, stmt_vec_info *, + slp_tree, slp_instance); +extern bool vectorizable_lc_phi (stmt_vec_info, stmt_vec_info *, slp_tree); extern bool vect_worthwhile_without_simd_p (vec_info *, tree_code); extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, stmt_vector_for_cost *, @@ -1637,15 +1766,16 @@ extern void vect_schedule_slp (vec_info *); extern opt_result vect_analyze_slp (vec_info *, unsigned); extern bool vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); -extern void vect_get_slp_defs (vec , slp_tree, vec > *); +extern void vect_get_slp_defs (slp_tree, vec > *, unsigned n = -1U); extern bool vect_slp_bb (basic_block); extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); extern bool is_simple_and_all_uses_invariant (stmt_vec_info, loop_vec_info); -extern bool can_duplicate_and_interleave_p (unsigned int, machine_mode, +extern bool can_duplicate_and_interleave_p (vec_info *, unsigned int, + machine_mode, unsigned int * = NULL, tree * = NULL, tree * = NULL); -extern void duplicate_and_interleave (gimple_seq *, tree, vec, - unsigned int, vec &); +extern void duplicate_and_interleave (vec_info *, gimple_seq *, tree, + vec, unsigned int, vec &); extern int vect_get_place_in_interleaving_chain 
(stmt_vec_info, stmt_vec_info); /* In tree-vect-patterns.c. */ diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c index 2140101d7d2..fbcd8aa6367 100644 --- a/gcc/tree-vrp.c +++ b/gcc/tree-vrp.c @@ -69,23 +69,20 @@ along with GCC; see the file COPYING3. If not see #include "builtins.h" #include "wide-int-range.h" +static bool +ranges_from_anti_range (const value_range_base *ar, + value_range_base *vr0, value_range_base *vr1, + bool handle_pointers = false); + /* Set of SSA names found live during the RPO traversal of the function for still active basic-blocks. */ static sbitmap *live; -void -value_range_base::set (enum value_range_kind kind, tree min, tree max) -{ - m_kind = kind; - m_min = min; - m_max = max; - if (flag_checking) - check (); -} - void value_range::set_equiv (bitmap equiv) { + if (undefined_p () || varying_p ()) + equiv = NULL; /* Since updating the equivalence set involves deep copying the bitmaps, only do it if absolutely necessary. @@ -261,7 +258,8 @@ value_range_base::constant_p () const void value_range_base::set_undefined () { - set (VR_UNDEFINED, NULL, NULL); + m_kind = VR_UNDEFINED; + m_min = m_max = NULL; } void @@ -273,7 +271,8 @@ value_range::set_undefined () void value_range_base::set_varying () { - set (VR_VARYING, NULL, NULL); + m_kind = VR_VARYING; + m_min = m_max = NULL; } void @@ -335,6 +334,24 @@ value_range::equiv_add (const_tree var, bool value_range_base::singleton_p (tree *result) const { + if (m_kind == VR_ANTI_RANGE) + { + if (nonzero_p ()) + { + if (TYPE_PRECISION (type ()) == 1) + { + if (result) + *result = m_max; + return true; + } + return false; + } + + value_range_base vr0, vr1; + return (ranges_from_anti_range (this, &vr0, &vr1, true) + && vr1.undefined_p () + && vr0.singleton_p (result)); + } if (m_kind == VR_RANGE && vrp_operand_equal_p (min (), max ()) && is_gimple_min_invariant (min ())) @@ -510,23 +527,28 @@ static assert_locus **asserts_for; /* Return the maximum value for TYPE. */ tree -vrp_val_max (const_tree type) +vrp_val_max (const_tree type, bool handle_pointers) { - if (!INTEGRAL_TYPE_P (type)) - return NULL_TREE; - - return TYPE_MAX_VALUE (type); + if (INTEGRAL_TYPE_P (type)) + return TYPE_MAX_VALUE (type); + if (POINTER_TYPE_P (type) && handle_pointers) + { + wide_int max = wi::max_value (TYPE_PRECISION (type), TYPE_SIGN (type)); + return wide_int_to_tree (const_cast (type), max); + } + return NULL_TREE; } /* Return the minimum value for TYPE. */ tree -vrp_val_min (const_tree type) +vrp_val_min (const_tree type, bool handle_pointers) { - if (!INTEGRAL_TYPE_P (type)) - return NULL_TREE; - - return TYPE_MIN_VALUE (type); + if (INTEGRAL_TYPE_P (type)) + return TYPE_MIN_VALUE (type); + if (POINTER_TYPE_P (type) && handle_pointers) + return build_zero_cst (const_cast (type)); + return NULL_TREE; } /* Return whether VAL is equal to the maximum value of its type. @@ -637,8 +659,7 @@ intersect_range_with_nonzero_bits (enum value_range_kind vr_type, extract ranges from var + CST op limit. */ void -value_range_base::set_and_canonicalize (enum value_range_kind kind, - tree min, tree max) +value_range_base::set (enum value_range_kind kind, tree min, tree max) { /* Use the canonical setters for VR_UNDEFINED and VR_VARYING. */ if (kind == VR_UNDEFINED) @@ -652,11 +673,31 @@ value_range_base::set_and_canonicalize (enum value_range_kind kind, return; } + /* Convert POLY_INT_CST bounds into worst-case INTEGER_CST bounds. 
*/ + if (POLY_INT_CST_P (min)) + { + tree type_min = vrp_val_min (TREE_TYPE (min), true); + widest_int lb + = constant_lower_bound_with_limit (wi::to_poly_widest (min), + wi::to_widest (type_min)); + min = wide_int_to_tree (TREE_TYPE (min), lb); + } + if (POLY_INT_CST_P (max)) + { + tree type_max = vrp_val_max (TREE_TYPE (max), true); + widest_int ub + = constant_upper_bound_with_limit (wi::to_poly_widest (max), + wi::to_widest (type_max)); + max = wide_int_to_tree (TREE_TYPE (max), ub); + } + /* Nothing to canonicalize for symbolic ranges. */ if (TREE_CODE (min) != INTEGER_CST || TREE_CODE (max) != INTEGER_CST) { - set (kind, min, max); + m_kind = kind; + m_min = min; + m_max = max; return; } @@ -692,12 +733,13 @@ value_range_base::set_and_canonicalize (enum value_range_kind kind, kind = kind == VR_RANGE ? VR_ANTI_RANGE : VR_RANGE; } + tree type = TREE_TYPE (min); + /* Anti-ranges that can be represented as ranges should be so. */ if (kind == VR_ANTI_RANGE) { /* For -fstrict-enums we may receive out-of-range ranges so consider values < -INF and values > INF as -INF/INF as well. */ - tree type = TREE_TYPE (min); bool is_min = (INTEGRAL_TYPE_P (type) && tree_int_cst_compare (min, TYPE_MIN_VALUE (type)) <= 0); bool is_max = (INTEGRAL_TYPE_P (type) @@ -740,22 +782,37 @@ value_range_base::set_and_canonicalize (enum value_range_kind kind, } } + /* Normalize [MIN, MAX] into VARYING and ~[MIN, MAX] into UNDEFINED. + + Avoid using TYPE_{MIN,MAX}_VALUE because -fstrict-enums can + restrict those to a subset of what actually fits in the type. + Instead use the extremes of the type precision which will allow + compare_range_with_value() to check if a value is inside a range, + whereas if we used TYPE_*_VAL, said function would just punt + upon seeing a VARYING. */ + unsigned prec = TYPE_PRECISION (type); + signop sign = TYPE_SIGN (type); + if (wi::eq_p (wi::to_wide (min), wi::min_value (prec, sign)) + && wi::eq_p (wi::to_wide (max), wi::max_value (prec, sign))) + { + if (kind == VR_RANGE) + set_varying (); + else if (kind == VR_ANTI_RANGE) + set_undefined (); + else + gcc_unreachable (); + return; + } + /* Do not drop [-INF(OVF), +INF(OVF)] to varying. (OVF) has to be sticky to make sure VRP iteration terminates, otherwise we can get into oscillations. */ - set (kind, min, max); -} - -void -value_range::set_and_canonicalize (enum value_range_kind kind, - tree min, tree max, bitmap equiv) -{ - value_range_base::set_and_canonicalize (kind, min, max); - if (this->kind () == VR_RANGE || this->kind () == VR_ANTI_RANGE) - set_equiv (equiv); - else - equiv_clear (); + m_kind = kind; + m_min = min; + m_max = max; + if (flag_checking) + check (); } void @@ -776,32 +833,19 @@ value_range::set (tree val) set (VR_RANGE, val, val, NULL); } -/* Set value range VR to a non-NULL range of type TYPE. */ +/* Set value range VR to a nonzero range of type TYPE. */ void -value_range_base::set_nonnull (tree type) +value_range_base::set_nonzero (tree type) { tree zero = build_int_cst (type, 0); set (VR_ANTI_RANGE, zero, zero); } -void -value_range::set_nonnull (tree type) -{ - tree zero = build_int_cst (type, 0); - set (VR_ANTI_RANGE, zero, zero, NULL); -} - -/* Set value range VR to a NULL range of type TYPE. */ +/* Set value range VR to a ZERO range of type TYPE. 
*/ void -value_range_base::set_null (tree type) -{ - set (build_int_cst (type, 0)); -} - -void -value_range::set_null (tree type) +value_range_base::set_zero (tree type) { set (build_int_cst (type, 0)); } @@ -830,22 +874,6 @@ vrp_bitmap_equal_p (const_bitmap b1, const_bitmap b2) && bitmap_equal_p (b1, b2))); } -/* Return true if VR is [0, 0]. */ - -static inline bool -range_is_null (const value_range_base *vr) -{ - return vr->zero_p (); -} - -static inline bool -range_is_nonnull (const value_range_base *vr) -{ - return (vr->kind () == VR_ANTI_RANGE - && vr->min () == vr->max () - && integer_zerop (vr->min ())); -} - /* Return true if max and min of VR are INTEGER_CST. It's not necessary a singleton. */ @@ -949,22 +977,17 @@ operand_less_p (tree val, tree val2) /* LT is folded faster than GE and others. Inline the common case. */ if (TREE_CODE (val) == INTEGER_CST && TREE_CODE (val2) == INTEGER_CST) return tree_int_cst_lt (val, val2); + else if (TREE_CODE (val) == SSA_NAME && TREE_CODE (val2) == SSA_NAME) + return val == val2 ? 0 : -2; else { - tree tcmp; - - fold_defer_overflow_warnings (); - - tcmp = fold_binary_to_constant (LT_EXPR, boolean_type_node, val, val2); - - fold_undefer_and_ignore_overflow_warnings (); - - if (!tcmp - || TREE_CODE (tcmp) != INTEGER_CST) - return -2; - - if (!integer_zerop (tcmp)) + int cmp = compare_values (val, val2); + if (cmp == -1) return 1; + else if (cmp == 0 || cmp == 1) + return 0; + else + return -2; } return 0; @@ -998,8 +1021,8 @@ compare_values_warnv (tree val1, tree val2, bool *strict_overflow_p) /* Convert the two values into the same type. This is needed because sizetype causes sign extension even for unsigned types. */ - val2 = fold_convert (TREE_TYPE (val1), val2); - STRIP_USELESS_TYPE_CONVERSION (val2); + if (!useless_type_conversion_p (TREE_TYPE (val1), TREE_TYPE (val2))) + val2 = fold_convert (TREE_TYPE (val1), val2); const bool overflow_undefined = INTEGRAL_TYPE_P (TREE_TYPE (val1)) @@ -1107,32 +1130,43 @@ compare_values_warnv (tree val1, tree val2, bool *strict_overflow_p) } else { - tree t; + if (TREE_CODE (val1) == INTEGER_CST && TREE_CODE (val2) == INTEGER_CST) + { + /* We cannot compare overflowed values. */ + if (TREE_OVERFLOW (val1) || TREE_OVERFLOW (val2)) + return -2; + + return tree_int_cst_compare (val1, val2); + } /* First see if VAL1 and VAL2 are not the same. */ - if (val1 == val2 || operand_equal_p (val1, val2, 0)) + if (operand_equal_p (val1, val2, 0)) return 0; + fold_defer_overflow_warnings (); + /* If VAL1 is a lower address than VAL2, return -1. */ - if (operand_less_p (val1, val2) == 1) - return -1; + tree t = fold_binary_to_constant (LT_EXPR, boolean_type_node, val1, val2); + if (t && integer_onep (t)) + { + fold_undefer_and_ignore_overflow_warnings (); + return -1; + } /* If VAL1 is a higher address than VAL2, return +1. */ - if (operand_less_p (val2, val1) == 1) - return 1; - - /* If VAL1 is different than VAL2, return +2. - For integer constants we either have already returned -1 or 1 - or they are equivalent. We still might succeed in proving - something about non-trivial operands. */ - if (TREE_CODE (val1) != INTEGER_CST - || TREE_CODE (val2) != INTEGER_CST) + t = fold_binary_to_constant (LT_EXPR, boolean_type_node, val2, val1); + if (t && integer_onep (t)) { - t = fold_binary_to_constant (NE_EXPR, boolean_type_node, val1, val2); - if (t && integer_onep (t)) - return 2; + fold_undefer_and_ignore_overflow_warnings (); + return 1; } + /* If VAL1 is different than VAL2, return +2. 
*/ + t = fold_binary_to_constant (NE_EXPR, boolean_type_node, val1, val2); + fold_undefer_and_ignore_overflow_warnings (); + if (t && integer_onep (t)) + return 2; + return -2; } } @@ -1231,7 +1265,8 @@ vrp_set_zero_nonzero_bits (const tree expr_type, static bool ranges_from_anti_range (const value_range_base *ar, - value_range_base *vr0, value_range_base *vr1) + value_range_base *vr0, value_range_base *vr1, + bool handle_pointers) { tree type = ar->type (); @@ -1244,18 +1279,18 @@ ranges_from_anti_range (const value_range_base *ar, if (ar->kind () != VR_ANTI_RANGE || TREE_CODE (ar->min ()) != INTEGER_CST || TREE_CODE (ar->max ()) != INTEGER_CST - || !vrp_val_min (type) - || !vrp_val_max (type)) + || !vrp_val_min (type, handle_pointers) + || !vrp_val_max (type, handle_pointers)) return false; - if (tree_int_cst_lt (vrp_val_min (type), ar->min ())) + if (tree_int_cst_lt (vrp_val_min (type, handle_pointers), ar->min ())) vr0->set (VR_RANGE, - vrp_val_min (type), + vrp_val_min (type, handle_pointers), wide_int_to_tree (type, wi::to_wide (ar->min ()) - 1)); - if (tree_int_cst_lt (ar->max (), vrp_val_max (type))) + if (tree_int_cst_lt (ar->max (), vrp_val_max (type, handle_pointers))) vr1->set (VR_RANGE, wide_int_to_tree (type, wi::to_wide (ar->max ()) + 1), - vrp_val_max (type)); + vrp_val_max (type, handle_pointers)); if (vr0->undefined_p ()) { *vr0 = *vr1; @@ -1266,21 +1301,20 @@ ranges_from_anti_range (const value_range_base *ar, } /* Extract the components of a value range into a pair of wide ints in - [WMIN, WMAX]. - - If the value range is anything but a VR_*RANGE of constants, the - resulting wide ints are set to [-MIN, +MAX] for the type. */ + [WMIN, WMAX], after having normalized any symbolics from the input. */ static void inline -extract_range_into_wide_ints (const value_range_base *vr, - signop sign, unsigned prec, - wide_int &wmin, wide_int &wmax) +extract_range_into_wide_ints (const value_range_base *vr_, + tree type, wide_int &wmin, wide_int &wmax) { - gcc_assert (vr->kind () != VR_ANTI_RANGE || vr->symbolic_p ()); - if (range_int_cst_p (vr)) + signop sign = TYPE_SIGN (type); + unsigned int prec = TYPE_PRECISION (type); + gcc_assert (vr_->kind () != VR_ANTI_RANGE || vr_->symbolic_p ()); + value_range vr = vr_->normalize_symbolics (); + if (range_int_cst_p (&vr)) { - wmin = wi::to_wide (vr->min ()); - wmax = wi::to_wide (vr->max ()); + wmin = wi::to_wide (vr.min ()); + wmax = wi::to_wide (vr.max ()); } else { @@ -1295,7 +1329,7 @@ extract_range_into_wide_ints (const value_range_base *vr, static void extract_range_from_multiplicative_op (value_range_base *vr, - enum tree_code code, + enum tree_code code, tree type, const value_range_base *vr0, const value_range_base *vr1) { @@ -1307,13 +1341,31 @@ extract_range_from_multiplicative_op (value_range_base *vr, || code == ROUND_DIV_EXPR || code == RSHIFT_EXPR || code == LSHIFT_EXPR); - gcc_assert (vr0->kind () == VR_RANGE - && vr0->kind () == vr1->kind ()); + if (!range_int_cst_p (vr1)) + { + vr->set_varying (); + return; + } + + /* Even if vr0 is VARYING or otherwise not usable, we can derive + useful ranges just from the shift count. E.g. + x >> 63 for signed 64-bit x is always [-1, 0]. 
*/ + value_range_base tem = vr0->normalize_symbolics (); + tree vr0_min, vr0_max; + if (tem.kind () == VR_RANGE) + { + vr0_min = tem.min (); + vr0_max = tem.max (); + } + else + { + vr0_min = vrp_val_min (type); + vr0_max = vrp_val_max (type); + } - tree type = vr0->type (); wide_int res_lb, res_ub; - wide_int vr0_lb = wi::to_wide (vr0->min ()); - wide_int vr0_ub = wi::to_wide (vr0->max ()); + wide_int vr0_lb = wi::to_wide (vr0_min); + wide_int vr0_ub = wi::to_wide (vr0_max); wide_int vr1_lb = wi::to_wide (vr1->min ()); wide_int vr1_ub = wi::to_wide (vr1->max ()); bool overflow_undefined = TYPE_OVERFLOW_UNDEFINED (type); @@ -1323,9 +1375,8 @@ extract_range_from_multiplicative_op (value_range_base *vr, code, TYPE_SIGN (type), prec, vr0_lb, vr0_ub, vr1_lb, vr1_ub, overflow_undefined)) - vr->set_and_canonicalize (VR_RANGE, - wide_int_to_tree (type, res_lb), - wide_int_to_tree (type, res_ub)); + vr->set (VR_RANGE, wide_int_to_tree (type, res_lb), + wide_int_to_tree (type, res_ub)); else vr->set_varying (); } @@ -1583,9 +1634,9 @@ extract_range_from_binary_expr (value_range_base *vr, code is EXACT_DIV_EXPR. We could mask out bits in the resulting range, but then we also need to hack up vrp_union. It's just easier to special case when vr0 is ~[0,0] for EXACT_DIV_EXPR. */ - if (code == EXACT_DIV_EXPR && range_is_nonnull (&vr0)) + if (code == EXACT_DIV_EXPR && vr0.nonzero_p ()) { - vr->set_nonnull (expr_type); + vr->set_nonzero (expr_type); return; } @@ -1663,9 +1714,9 @@ extract_range_from_binary_expr (value_range_base *vr, If both are null, then the result is null. Otherwise they are varying. */ if (!range_includes_zero_p (&vr0) && !range_includes_zero_p (&vr1)) - vr->set_nonnull (expr_type); - else if (range_is_null (&vr0) && range_is_null (&vr1)) - vr->set_null (expr_type); + vr->set_nonzero (expr_type); + else if (vr0.zero_p () && vr1.zero_p ()) + vr->set_zero (expr_type); else vr->set_varying (); } @@ -1692,9 +1743,9 @@ extract_range_from_binary_expr (value_range_base *vr, && (flag_delete_null_pointer_checks || (range_int_cst_p (&vr1) && !tree_int_cst_sign_bit (vr1.max ())))) - vr->set_nonnull (expr_type); - else if (range_is_null (&vr0) && range_is_null (&vr1)) - vr->set_null (expr_type); + vr->set_nonzero (expr_type); + else if (vr0.zero_p () && vr1.zero_p ()) + vr->set_zero (expr_type); else vr->set_varying (); } @@ -1702,8 +1753,8 @@ extract_range_from_binary_expr (value_range_base *vr, { /* For pointer types, we are really only interested in asserting whether the expression evaluates to non-NULL. */ - if (range_is_null (&vr0) || range_is_null (&vr1)) - vr->set_null (expr_type); + if (vr0.zero_p () || vr1.zero_p ()) + vr->set_zero (expr_type); else vr->set_varying (); } @@ -1717,19 +1768,30 @@ extract_range_from_binary_expr (value_range_base *vr, range and see what we end up with. */ if (code == PLUS_EXPR || code == MINUS_EXPR) { + value_range_kind vr0_kind = vr0.kind (), vr1_kind = vr1.kind (); + tree vr0_min = vr0.min (), vr0_max = vr0.max (); + tree vr1_min = vr1.min (), vr1_max = vr1.max (); /* This will normalize things such that calculating [0,0] - VR_VARYING is not dropped to varying, but is calculated as [MIN+1, MAX]. 
*/ if (vr0.varying_p ()) - vr0.set (VR_RANGE, vrp_val_min (expr_type), vrp_val_max (expr_type)); + { + vr0_kind = VR_RANGE; + vr0_min = vrp_val_min (expr_type); + vr0_max = vrp_val_max (expr_type); + } if (vr1.varying_p ()) - vr1.set (VR_RANGE, vrp_val_min (expr_type), vrp_val_max (expr_type)); + { + vr1_kind = VR_RANGE; + vr1_min = vrp_val_min (expr_type); + vr1_max = vrp_val_max (expr_type); + } const bool minus_p = (code == MINUS_EXPR); - tree min_op0 = vr0.min (); - tree min_op1 = minus_p ? vr1.max () : vr1.min (); - tree max_op0 = vr0.max (); - tree max_op1 = minus_p ? vr1.min () : vr1.max (); + tree min_op0 = vr0_min; + tree min_op1 = minus_p ? vr1_max : vr1_min; + tree max_op0 = vr0_max; + tree max_op1 = minus_p ? vr1_min : vr1_max; tree sym_min_op0 = NULL_TREE; tree sym_min_op1 = NULL_TREE; tree sym_max_op0 = NULL_TREE; @@ -1742,7 +1804,7 @@ extract_range_from_binary_expr (value_range_base *vr, single-symbolic ranges, try to compute the precise resulting range, but only if we know that this resulting range will also be constant or single-symbolic. */ - if (vr0.kind () == VR_RANGE && vr1.kind () == VR_RANGE + if (vr0_kind == VR_RANGE && vr1_kind == VR_RANGE && (TREE_CODE (min_op0) == INTEGER_CST || (sym_min_op0 = get_single_symbol (min_op0, &neg_min_op0, &min_op0))) @@ -1823,8 +1885,8 @@ extract_range_from_binary_expr (value_range_base *vr, wide_int wmin, wmax; wide_int vr0_min, vr0_max; wide_int vr1_min, vr1_max; - extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); - extract_range_into_wide_ints (&vr1, sign, prec, vr1_min, vr1_max); + extract_range_into_wide_ints (&vr0, expr_type, vr0_min, vr0_max); + extract_range_into_wide_ints (&vr1, expr_type, vr1_min, vr1_max); if (wide_int_range_min_max (wmin, wmax, code, sign, prec, vr0_min, vr0_max, vr1_min, vr1_max)) vr->set (VR_RANGE, wide_int_to_tree (expr_type, wmin), @@ -1841,7 +1903,7 @@ extract_range_from_binary_expr (value_range_base *vr, vr->set_varying (); return; } - extract_range_from_multiplicative_op (vr, code, &vr0, &vr1); + extract_range_from_multiplicative_op (vr, code, expr_type, &vr0, &vr1); return; } else if (code == RSHIFT_EXPR @@ -1856,13 +1918,8 @@ extract_range_from_binary_expr (value_range_base *vr, { if (code == RSHIFT_EXPR) { - /* Even if vr0 is VARYING or otherwise not usable, we can derive - useful ranges just from the shift count. E.g. - x >> 63 for signed 64-bit x is always [-1, 0]. */ - if (vr0.kind () != VR_RANGE || vr0.symbolic_p ()) - vr0.set (VR_RANGE, vrp_val_min (expr_type), - vrp_val_max (expr_type)); - extract_range_from_multiplicative_op (vr, code, &vr0, &vr1); + extract_range_from_multiplicative_op (vr, code, expr_type, + &vr0, &vr1); return; } else if (code == LSHIFT_EXPR @@ -1878,7 +1935,7 @@ extract_range_from_binary_expr (value_range_base *vr, { min = wide_int_to_tree (expr_type, res_lb); max = wide_int_to_tree (expr_type, res_ub); - vr->set_and_canonicalize (VR_RANGE, min, max); + vr->set (VR_RANGE, min, max); return; } } @@ -1897,7 +1954,7 @@ extract_range_from_binary_expr (value_range_base *vr, bool extra_range_p; /* Special case explicit division by zero as undefined. */ - if (range_is_null (&vr1)) + if (vr1.zero_p ()) { vr->set_undefined (); return; @@ -1910,9 +1967,9 @@ extract_range_from_binary_expr (value_range_base *vr, NOTE: As a future improvement, we may be able to do better with mixed symbolic (anti-)ranges like [0, A]. See note in ranges_from_anti_range. 
*/ - extract_range_into_wide_ints (&vr0, sign, prec, + extract_range_into_wide_ints (&vr0, expr_type, dividend_min, dividend_max); - extract_range_into_wide_ints (&vr1, sign, prec, + extract_range_into_wide_ints (&vr1, expr_type, divisor_min, divisor_max); if (!wide_int_range_div (wmin, wmax, code, sign, prec, dividend_min, dividend_max, @@ -1936,15 +1993,15 @@ extract_range_from_binary_expr (value_range_base *vr, } else if (code == TRUNC_MOD_EXPR) { - if (range_is_null (&vr1)) + if (vr1.zero_p ()) { vr->set_undefined (); return; } wide_int wmin, wmax, tmp; wide_int vr0_min, vr0_max, vr1_min, vr1_max; - extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); - extract_range_into_wide_ints (&vr1, sign, prec, vr1_min, vr1_max); + extract_range_into_wide_ints (&vr0, expr_type, vr0_min, vr0_max); + extract_range_into_wide_ints (&vr1, expr_type, vr1_min, vr1_max); wide_int_range_trunc_mod (wmin, wmax, sign, prec, vr0_min, vr0_max, vr1_min, vr1_max); min = wide_int_to_tree (expr_type, wmin); @@ -1962,8 +2019,8 @@ extract_range_from_binary_expr (value_range_base *vr, &may_be_nonzero0, &must_be_nonzero0); vrp_set_zero_nonzero_bits (expr_type, &vr1, &may_be_nonzero1, &must_be_nonzero1); - extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); - extract_range_into_wide_ints (&vr1, sign, prec, vr1_min, vr1_max); + extract_range_into_wide_ints (&vr0, expr_type, vr0_min, vr0_max); + extract_range_into_wide_ints (&vr1, expr_type, vr1_min, vr1_max); if (code == BIT_AND_EXPR) { if (wide_int_range_bit_and (wmin, wmax, sign, prec, @@ -2140,9 +2197,9 @@ extract_range_from_unary_expr (value_range_base *vr, if (POINTER_TYPE_P (type) || POINTER_TYPE_P (op0_type)) { if (!range_includes_zero_p (&vr0)) - vr->set_nonnull (type); - else if (range_is_null (&vr0)) - vr->set_null (type); + vr->set_nonzero (type); + else if (vr0.zero_p ()) + vr->set_zero (type); else vr->set_varying (); return; @@ -2167,8 +2224,7 @@ extract_range_from_unary_expr (value_range_base *vr, signop outer_sign = TYPE_SIGN (outer_type); unsigned inner_prec = TYPE_PRECISION (inner_type); unsigned outer_prec = TYPE_PRECISION (outer_type); - extract_range_into_wide_ints (&vr0, inner_sign, inner_prec, - vr0_min, vr0_max); + extract_range_into_wide_ints (&vr0, inner_type, vr0_min, vr0_max); if (wide_int_range_convert (wmin, wmax, inner_sign, inner_prec, outer_sign, outer_prec, @@ -2176,7 +2232,7 @@ extract_range_from_unary_expr (value_range_base *vr, { tree min = wide_int_to_tree (outer_type, wmin); tree max = wide_int_to_tree (outer_type, wmax); - vr->set_and_canonicalize (VR_RANGE, min, max); + vr->set (VR_RANGE, min, max); } else vr->set_varying (); @@ -2186,7 +2242,7 @@ extract_range_from_unary_expr (value_range_base *vr, { wide_int wmin, wmax; wide_int vr0_min, vr0_max; - extract_range_into_wide_ints (&vr0, sign, prec, vr0_min, vr0_max); + extract_range_into_wide_ints (&vr0, type, vr0_min, vr0_max); if (wide_int_range_abs (wmin, wmax, sign, prec, vr0_min, vr0_max, TYPE_OVERFLOW_UNDEFINED (type))) vr->set (VR_RANGE, wide_int_to_tree (type, wmin), @@ -2199,7 +2255,8 @@ extract_range_from_unary_expr (value_range_base *vr, { wide_int wmin, wmax; wide_int vr0_min, vr0_max; - extract_range_into_wide_ints (&vr0, SIGNED, prec, vr0_min, vr0_max); + tree signed_type = make_signed_type (TYPE_PRECISION (type)); + extract_range_into_wide_ints (&vr0, signed_type, vr0_min, vr0_max); wide_int_range_absu (wmin, wmax, prec, vr0_min, vr0_max); vr->set (VR_RANGE, wide_int_to_tree (type, wmin), wide_int_to_tree (type, wmax)); @@ -5468,8 
+5525,10 @@ union_ranges (enum value_range_kind *vr0type, enum value_range_kind vr1type, tree vr1min, tree vr1max) { - bool mineq = vrp_operand_equal_p (*vr0min, vr1min); - bool maxeq = vrp_operand_equal_p (*vr0max, vr1max); + int cmpmin = compare_values (*vr0min, vr1min); + int cmpmax = compare_values (*vr0max, vr1max); + bool mineq = cmpmin == 0; + bool maxeq = cmpmax == 0; /* [] is vr0, () is vr1 in the following classification comments. */ if (mineq && maxeq) @@ -5569,8 +5628,8 @@ union_ranges (enum value_range_kind *vr0type, else gcc_unreachable (); } - else if ((maxeq || operand_less_p (vr1max, *vr0max) == 1) - && (mineq || operand_less_p (*vr0min, vr1min) == 1)) + else if ((maxeq || cmpmax == 1) + && (mineq || cmpmin == -1)) { /* [ ( ) ] or [( ) ] or [ ( )] */ if (*vr0type == VR_RANGE @@ -5603,8 +5662,8 @@ union_ranges (enum value_range_kind *vr0type, else gcc_unreachable (); } - else if ((maxeq || operand_less_p (*vr0max, vr1max) == 1) - && (mineq || operand_less_p (vr1min, *vr0min) == 1)) + else if ((maxeq || cmpmax == -1) + && (mineq || cmpmin == 1)) { /* ( [ ] ) or ([ ] ) or ( [ ]) */ if (*vr0type == VR_RANGE @@ -5643,10 +5702,10 @@ union_ranges (enum value_range_kind *vr0type, else gcc_unreachable (); } - else if ((operand_less_p (vr1min, *vr0max) == 1 - || operand_equal_p (vr1min, *vr0max, 0)) - && operand_less_p (*vr0min, vr1min) == 1 - && operand_less_p (*vr0max, vr1max) == 1) + else if (cmpmin == -1 + && cmpmax == -1 + && (operand_less_p (vr1min, *vr0max) == 1 + || operand_equal_p (vr1min, *vr0max, 0))) { /* [ ( ] ) or [ ]( ) */ if (*vr0type == VR_RANGE @@ -5680,10 +5739,10 @@ union_ranges (enum value_range_kind *vr0type, else gcc_unreachable (); } - else if ((operand_less_p (*vr0min, vr1max) == 1 - || operand_equal_p (*vr0min, vr1max, 0)) - && operand_less_p (vr1min, *vr0min) == 1 - && operand_less_p (vr1max, *vr0max) == 1) + else if (cmpmin == 1 + && cmpmax == 1 + && (operand_less_p (*vr0min, vr1max) == 1 + || operand_equal_p (*vr0min, vr1max, 0))) { /* ( [ ) ] or ( )[ ] */ if (*vr0type == VR_RANGE @@ -6083,7 +6142,7 @@ value_range::intersect_helper (value_range *vr0, const value_range *vr1) VR_RANGE can still be a VR_RANGE. Work on a temporary so we can fall back to vr0 when this turns things to varying. */ value_range tem; - tem.set_and_canonicalize (vr0type, vr0min, vr0max); + tem.set (vr0type, vr0min, vr0max); /* If that failed, use the saved original VR0. */ if (tem.varying_p ()) return; @@ -6152,8 +6211,8 @@ value_range_base::union_helper (const value_range_base *vr0, vr1->kind (), vr1->min (), vr1->max ()); /* Work on a temporary so we can still use vr0 when union returns varying. */ - value_range tem; - tem.set_and_canonicalize (vr0type, vr0min, vr0max); + value_range_base tem; + tem.set (vr0type, vr0min, vr0max); /* Failed to find an efficient meet. Before giving up and setting the result to VARYING, see if we can at least derive a useful @@ -6162,7 +6221,7 @@ value_range_base::union_helper (const value_range_base *vr0, && range_includes_zero_p (vr0) == 0 && range_includes_zero_p (vr1) == 0) { - tem.set_nonnull (vr0->type ()); + tem.set_nonzero (vr0->type ()); return tem; } @@ -6233,6 +6292,58 @@ value_range::union_ (const value_range *other) } } +/* Normalize symbolics into constants. 
*/ + +value_range_base +value_range_base::normalize_symbolics () const +{ + if (varying_p () || undefined_p ()) + return *this; + tree ttype = type (); + bool min_symbolic = !is_gimple_min_invariant (min ()); + bool max_symbolic = !is_gimple_min_invariant (max ()); + if (!min_symbolic && !max_symbolic) + return *this; + + // [SYM, SYM] -> VARYING + if (min_symbolic && max_symbolic) + { + value_range_base var; + var.set_varying (); + return var; + } + if (kind () == VR_RANGE) + { + // [SYM, NUM] -> [-MIN, NUM] + if (min_symbolic) + return value_range_base (VR_RANGE, vrp_val_min (ttype), max ()); + // [NUM, SYM] -> [NUM, +MAX] + return value_range_base (VR_RANGE, min (), vrp_val_max (ttype)); + } + gcc_assert (kind () == VR_ANTI_RANGE); + // ~[SYM, NUM] -> [NUM + 1, +MAX] + if (min_symbolic) + { + if (!vrp_val_is_max (max ())) + { + tree n = wide_int_to_tree (ttype, wi::to_wide (max ()) + 1); + return value_range_base (VR_RANGE, n, vrp_val_max (ttype)); + } + value_range_base var; + var.set_varying (); + return var; + } + // ~[NUM, SYM] -> [-MIN, NUM - 1] + if (!vrp_val_is_min (min ())) + { + tree n = wide_int_to_tree (ttype, wi::to_wide (min ()) - 1); + return value_range_base (VR_RANGE, vrp_val_min (ttype), n); + } + value_range_base var; + var.set_varying (); + return var; +} + /* Visit all arguments for PHI node PHI that flow through executable edges. If a valid value range can be derived from all the incoming value ranges, set a new range for the LHS of PHI. */ diff --git a/gcc/tree-vrp.h b/gcc/tree-vrp.h index 9d52b428d05..4bcff924b58 100644 --- a/gcc/tree-vrp.h +++ b/gcc/tree-vrp.h @@ -46,8 +46,8 @@ public: void set (value_range_kind, tree, tree); void set (tree); - void set_nonnull (tree); - void set_null (tree); + void set_nonzero (tree); + void set_zero (tree); enum value_range_kind kind () const; tree min () const; @@ -70,11 +70,13 @@ public: /* Misc methods. */ tree type () const; bool may_contain_p (tree) const; - void set_and_canonicalize (enum value_range_kind, tree, tree); bool zero_p () const; + bool nonzero_p () const; bool singleton_p (tree *result = NULL) const; void dump (FILE *) const; + value_range_base normalize_symbolics () const; + protected: void check (); static value_range_base union_helper (const value_range_base *, @@ -118,8 +120,6 @@ class GTY((user)) value_range : public value_range_base /* Deep-copies equiv bitmap argument. */ void set (value_range_kind, tree, tree, bitmap = NULL); void set (tree); - void set_nonnull (tree); - void set_null (tree); bool operator== (const value_range &) const /* = delete */; bool operator!= (const value_range &) const /* = delete */; @@ -138,7 +138,6 @@ class GTY((user)) value_range : public value_range_base /* Misc methods. */ void deep_copy (const value_range *); - void set_and_canonicalize (enum value_range_kind, tree, tree, bitmap = NULL); void dump (FILE *) const; private: @@ -222,6 +221,16 @@ value_range_base::zero_p () const && integer_zerop (m_max)); } +/* Return TRUE if range is nonzero. 
*/ + +inline bool +value_range_base::nonzero_p () const +{ + return (m_kind == VR_ANTI_RANGE + && integer_zerop (m_min) + && integer_zerop (m_max)); +} + extern void dump_value_range (FILE *, const value_range *); extern void dump_value_range (FILE *, const value_range_base *); @@ -259,8 +268,8 @@ extern bool vrp_val_is_min (const_tree); extern bool vrp_val_is_max (const_tree); extern int value_inside_range (tree, tree, tree); -extern tree vrp_val_min (const_tree); -extern tree vrp_val_max (const_tree); +extern tree vrp_val_min (const_tree, bool handle_pointers = false); +extern tree vrp_val_max (const_tree, bool handle_pointers = false); extern void extract_range_from_unary_expr (value_range_base *vr, enum tree_code code, diff --git a/gcc/tree.c b/gcc/tree.c index 32e94e48132..c4b8eea675f 100644 --- a/gcc/tree.c +++ b/gcc/tree.c @@ -8213,8 +8213,6 @@ build_nonstandard_integer_type (unsigned HOST_WIDE_INT precision, else fixup_signed_type (itype); - ret = itype; - inchash::hash hstate; inchash::add_expr (TYPE_MAX_VALUE (itype), hstate); ret = type_hash_canon (hstate.end (), itype); @@ -11079,44 +11077,44 @@ build_vector_type (tree innertype, poly_int64 nunits) return make_vector_type (innertype, nunits, VOIDmode); } -/* Build truth vector with specified length and number of units. */ +/* Build a truth vector with NUNITS units, giving it mode MASK_MODE. */ tree -build_truth_vector_type (poly_uint64 nunits, poly_uint64 vector_size) +build_truth_vector_type_for_mode (poly_uint64 nunits, machine_mode mask_mode) { - machine_mode mask_mode - = targetm.vectorize.get_mask_mode (nunits, vector_size).else_blk (); - - poly_uint64 vsize; - if (mask_mode == BLKmode) - vsize = vector_size * BITS_PER_UNIT; - else - vsize = GET_MODE_BITSIZE (mask_mode); + gcc_assert (mask_mode != BLKmode); + poly_uint64 vsize = GET_MODE_BITSIZE (mask_mode); unsigned HOST_WIDE_INT esize = vector_element_size (vsize, nunits); - tree bool_type = build_nonstandard_boolean_type (esize); return make_vector_type (bool_type, nunits, mask_mode); } -/* Returns a vector type corresponding to a comparison of VECTYPE. */ +/* Build a vector type that holds one boolean result for each element of + vector type VECTYPE. The public interface for this operation is + truth_type_for. */ -tree -build_same_sized_truth_vector_type (tree vectype) +static tree +build_truth_vector_type_for (tree vectype) { - if (VECTOR_BOOLEAN_TYPE_P (vectype)) - return vectype; + machine_mode vector_mode = TYPE_MODE (vectype); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); - poly_uint64 size = GET_MODE_SIZE (TYPE_MODE (vectype)); + machine_mode mask_mode; + if (VECTOR_MODE_P (vector_mode) + && targetm.vectorize.get_mask_mode (vector_mode).exists (&mask_mode)) + return build_truth_vector_type_for_mode (nunits, mask_mode); - if (known_eq (size, 0U)) - size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); + poly_uint64 vsize = tree_to_poly_uint64 (TYPE_SIZE (vectype)); + unsigned HOST_WIDE_INT esize = vector_element_size (vsize, nunits); + tree bool_type = build_nonstandard_boolean_type (esize); - return build_truth_vector_type (TYPE_VECTOR_SUBPARTS (vectype), size); + return make_vector_type (bool_type, nunits, BLKmode); } -/* Similarly, but builds a variant type with TYPE_VECTOR_OPAQUE set. */ +/* Like build_vector_type, but builds a variant type with TYPE_VECTOR_OPAQUE + set. 
*/ tree build_opaque_vector_type (tree innertype, poly_int64 nunits) @@ -11915,8 +11913,7 @@ truth_type_for (tree type) { if (VECTOR_BOOLEAN_TYPE_P (type)) return type; - return build_truth_vector_type (TYPE_VECTOR_SUBPARTS (type), - GET_MODE_SIZE (TYPE_MODE (type))); + return build_truth_vector_type_for (type); } else return boolean_type_node; diff --git a/gcc/tree.h b/gcc/tree.h index 2f8e37bb356..6f73593faa7 100644 --- a/gcc/tree.h +++ b/gcc/tree.h @@ -4272,8 +4272,7 @@ extern tree build_reference_type_for_mode (tree, machine_mode, bool); extern tree build_reference_type (tree); extern tree build_vector_type_for_mode (tree, machine_mode); extern tree build_vector_type (tree, poly_int64); -extern tree build_truth_vector_type (poly_uint64, poly_uint64); -extern tree build_same_sized_truth_vector_type (tree vectype); +extern tree build_truth_vector_type_for_mode (poly_uint64, machine_mode); extern tree build_opaque_vector_type (tree, poly_int64); extern tree build_index_type (tree); extern tree build_array_type (tree, tree, bool = false); diff --git a/gcc/vr-values.c b/gcc/vr-values.c index 0e10aca92bb..02c89ab030a 100644 --- a/gcc/vr-values.c +++ b/gcc/vr-values.c @@ -118,7 +118,10 @@ vr_values::get_value_range (const_tree var) if (POINTER_TYPE_P (TREE_TYPE (sym)) && (nonnull_arg_p (sym) || get_ptr_nonnull (var))) - vr->set_nonnull (TREE_TYPE (sym)); + { + vr->set_nonzero (TREE_TYPE (sym)); + vr->equiv_clear (); + } else if (INTEGRAL_TYPE_P (TREE_TYPE (sym))) { get_range_info (var, *vr); @@ -130,7 +133,10 @@ vr_values::get_value_range (const_tree var) } else if (TREE_CODE (sym) == RESULT_DECL && DECL_BY_REFERENCE (sym)) - vr->set_nonnull (TREE_TYPE (sym)); + { + vr->set_nonzero (TREE_TYPE (sym)); + vr->equiv_clear (); + } } return vr; @@ -491,9 +497,9 @@ vr_values::extract_range_for_var_from_comparison_expr (tree var, vice-versa. Use set_and_canonicalize which does this for us. 
*/ if (cond_code == LE_EXPR) - vr_p->set_and_canonicalize (VR_RANGE, min, max, vr_p->equiv ()); + vr_p->set (VR_RANGE, min, max, vr_p->equiv ()); else if (cond_code == GT_EXPR) - vr_p->set_and_canonicalize (VR_ANTI_RANGE, min, max, vr_p->equiv ()); + vr_p->set (VR_ANTI_RANGE, min, max, vr_p->equiv ()); else gcc_unreachable (); } @@ -565,7 +571,7 @@ vr_values::extract_range_for_var_from_comparison_expr (tree var, && vrp_val_is_max (max)) min = max = limit; - vr_p->set_and_canonicalize (VR_ANTI_RANGE, min, max, vr_p->equiv ()); + vr_p->set (VR_ANTI_RANGE, min, max, vr_p->equiv ()); } else if (cond_code == LE_EXPR || cond_code == LT_EXPR) { @@ -858,7 +864,10 @@ vr_values::extract_range_from_binary_expr (value_range *vr, || (vr1.kind () == VR_ANTI_RANGE && vr1.min () == op0 && vr1.min () == vr1.max ()))) - vr->set_nonnull (expr_type); + { + vr->set_nonzero (expr_type); + vr->equiv_clear (); + } } /* Extract range information from a unary expression CODE OP0 based on @@ -1085,7 +1094,8 @@ vr_values::extract_range_basic (value_range *vr, gimple *stmt) && TREE_CODE (SSA_NAME_VAR (arg)) == PARM_DECL && cfun->after_inlining) { - vr->set_null (type); + vr->set_zero (type); + vr->equiv_clear (); return; } break; @@ -1392,7 +1402,10 @@ vr_values::extract_range_basic (value_range *vr, gimple *stmt) && gimple_stmt_nonnegative_warnv_p (stmt, &sop)) set_value_range_to_nonnegative (vr, type); else if (vrp_stmt_computes_nonzero (stmt)) - vr->set_nonnull (type); + { + vr->set_nonzero (type); + vr->equiv_clear (); + } else vr->set_varying (); } diff --git a/libgcc/libgcov-driver-system.c b/libgcc/libgcov-driver-system.c index b5f3e89ebdc..0d106002098 100644 --- a/libgcc/libgcov-driver-system.c +++ b/libgcc/libgcov-driver-system.c @@ -262,10 +262,8 @@ static int gcov_exit_open_gcda_file (struct gcov_info *gi_ptr, struct gcov_filename *gf) { - const char *fname = gi_ptr->filename; int append_slash = 0; - - fname = gi_ptr->filename; + const char *fname = gi_ptr->filename; /* Build relocated filename, stripping off leading directories from the initial filename if requested. */ diff --git a/libgcc/libgcov-util.c b/libgcc/libgcov-util.c index ae0dd017204..e672768966b 100644 --- a/libgcc/libgcov-util.c +++ b/libgcc/libgcov-util.c @@ -461,10 +461,9 @@ gcov_read_profile_dir (const char* dir_name, int recompute_summary ATTRIBUTE_UNU #ifdef HAVE_FTW_H ftw (".", ftw_read_file, 50); #endif - ret = chdir (pwd); + chdir (pwd); free (pwd); - return gcov_info_head;; }
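
The scalar_cond_masked_set added to loop_vec_info in the tree-vectorizer.h hunks above is keyed by (condition, ncopies) pairs so that a statement can reuse a loop mask that was already applied to the same scalar condition instead of emitting the comparison again. A minimal usage sketch follows, assuming GCC's hash_set add/contains interface; the helper names are hypothetical and not part of the patches.

/* Illustrative sketch only.  NCOPIES must be nonzero: zero is reserved as
   the empty-slot marker in the hash traits defined above.  */
static void
example_record_masked_cond (loop_vec_info loop_vinfo, tree cond, unsigned ncopies)
{
  /* The key decomposes COND into (code, op0, op1) via
     get_cond_ops_from_tree.  */
  scalar_cond_masked_key key (cond, ncopies);
  loop_vinfo->scalar_cond_masked_set.add (key);
}

static bool
example_cond_already_masked (loop_vec_info loop_vinfo, tree cond, unsigned ncopies)
{
  scalar_cond_masked_key key (cond, ncopies);
  return loop_vinfo->scalar_cond_masked_set.contains (key);
}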