322 lines
12 KiB
Diff
322 lines
12 KiB
Diff
|
|
This backport contains 4 patches from the upstream GCC mainline tree.
|
||
|
|
The commit IDs of these patches are listed below in chronological order.
|
||
|
|
|
||
|
|
0001-re-PR-target-90424-memcpy-into-vector-builtin-not-op.patch
|
||
|
|
1bf2a0b90f2457f6d9301535560eb5e05978261b
|
||
|
|
|
||
|
|
0002-testsuite-aarch64-arm-Add-missing-quotes-to-expected.patch
|
||
|
|
0ec537f3500924f29505977aa89c2a1d4671c584
|
||
|
|
|
||
|
|
0003-x86-Tweak-testcases-for-PR82361.patch
|
||
|
|
ad4644f378fe2f731cd987a4aff14b935f530b88
|
||
|
|
|
||
|
|
0004-x86-Robustify-vzeroupper-handling-across-calls.patch
|
||
|
|
2a2e3a0dfcbe0861915f421d11b828f0c35023f0
|
||
|
|
|
||
|
|
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
|
||
|
|
index 9282a8fb6..ba72da1ec 100644
|
||
|
|
--- a/gcc/config/i386/i386.c
|
||
|
|
+++ b/gcc/config/i386/i386.c
|
||
|
|
@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3. If not see
|
||
|
|
#include "i386-builtins.h"
|
||
|
|
#include "i386-expand.h"
|
||
|
|
#include "i386-features.h"
|
||
|
|
+#include "function-abi.h"
|
||
|
|
|
||
|
|
/* This file should be included last. */
|
||
|
|
#include "target-def.h"
|
||
|
|
@@ -13529,6 +13530,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
+ /* If the function is known to preserve some SSE registers,
|
||
|
|
+ RA and previous passes can legitimately rely on that for
|
||
|
|
+ modes wider than 256 bits. It's only safe to issue a
|
||
|
|
+ vzeroupper if all SSE registers are clobbered. */
|
||
|
|
+ const function_abi &abi = insn_callee_abi (insn);
|
||
|
|
+ if (!hard_reg_set_subset_p (reg_class_contents[ALL_SSE_REGS],
|
||
|
|
+ abi.mode_clobbers (V4DImode)))
|
||
|
|
+ return AVX_U128_ANY;
|
||
|
|
+
|
||
|
|
return AVX_U128_CLEAN;
|
||
|
|
}
|
||
|
|
|
||
|
|
diff --git a/gcc/testsuite/g++.target/i386/pr90424-1.C b/gcc/testsuite/g++.target/i386/pr90424-1.C
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..9df8c089b
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/g++.target/i386/pr90424-1.C
|
||
|
|
@@ -0,0 +1,32 @@
|
||
|
|
+/* { dg-do compile { target c++11 } } */
|
||
|
|
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
|
||
|
|
+
|
||
|
|
+template <class T>
|
||
|
|
+using V [[gnu::vector_size(16)]] = T;
|
||
|
|
+
|
||
|
|
+template <class T, unsigned M = sizeof(V<T>)>
|
||
|
|
+V<T> load(const void *p) {
|
||
|
|
+ using W = V<T>;
|
||
|
|
+ W r;
|
||
|
|
+ __builtin_memcpy(&r, p, M);
|
||
|
|
+ return r;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+// movq or movsd
|
||
|
|
+template V<char> load<char, 8>(const void *); // bad
|
||
|
|
+template V<short> load<short, 8>(const void *); // bad
|
||
|
|
+template V<int> load<int, 8>(const void *); // bad
|
||
|
|
+template V<long> load<long, 8>(const void *); // good
|
||
|
|
+// the following is disabled because V2SF isn't a supported mode
|
||
|
|
+// template V<float> load<float, 8>(const void *); // bad
|
||
|
|
+template V<double> load<double, 8>(const void *); // good (movsd?)
|
||
|
|
+
|
||
|
|
+// movd or movss
|
||
|
|
+template V<char> load<char, 4>(const void *); // bad
|
||
|
|
+template V<short> load<short, 4>(const void *); // bad
|
||
|
|
+template V<int> load<int, 4>(const void *); // good
|
||
|
|
+template V<float> load<float, 4>(const void *); // good
|
||
|
|
+
|
||
|
|
+/* We should end up with one load and one insert for each function. */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
|
||
|
|
diff --git a/gcc/testsuite/g++.target/i386/pr90424-2.C b/gcc/testsuite/g++.target/i386/pr90424-2.C
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..3abb65f45
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/g++.target/i386/pr90424-2.C
|
||
|
|
@@ -0,0 +1,31 @@
|
||
|
|
+/* { dg-do compile { target c++11 } } */
|
||
|
|
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
|
||
|
|
+
|
||
|
|
+template <class T>
|
||
|
|
+using V [[gnu::vector_size(16)]] = T;
|
||
|
|
+
|
||
|
|
+template <class T, unsigned M = sizeof(V<T>)>
|
||
|
|
+V<T> load(const void *p) {
|
||
|
|
+ V<T> r = {};
|
||
|
|
+ __builtin_memcpy(&r, p, M);
|
||
|
|
+ return r;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+// movq or movsd
|
||
|
|
+template V<char> load<char, 8>(const void *); // bad
|
||
|
|
+template V<short> load<short, 8>(const void *); // bad
|
||
|
|
+template V<int> load<int, 8>(const void *); // bad
|
||
|
|
+template V<long> load<long, 8>(const void *); // good
|
||
|
|
+// the following is disabled because V2SF isn't a supported mode
|
||
|
|
+// template V<float> load<float, 8>(const void *); // bad
|
||
|
|
+template V<double> load<double, 8>(const void *); // good (movsd?)
|
||
|
|
+
|
||
|
|
+// movd or movss
|
||
|
|
+template V<char> load<char, 4>(const void *); // bad
|
||
|
|
+template V<short> load<short, 4>(const void *); // bad
|
||
|
|
+template V<int> load<int, 4>(const void *); // good
|
||
|
|
+template V<float> load<float, 4>(const void *); // good
|
||
|
|
+
|
||
|
|
+/* We should end up with one load and one insert for each function. */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
|
||
|
|
index 184990471..d96a8733a 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
|
||
|
|
@@ -13,4 +13,4 @@ foo (uint8x16_t a, uint8x16_t b, uint8x16_t c)
|
||
|
|
return vbslq_u8 (a, b, c); /* { dg-message "called from here" } */
|
||
|
|
}
|
||
|
|
|
||
|
|
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
|
||
|
|
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
|
||
|
|
index 05dc579f2..fb6e0b9cd 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
|
||
|
|
@@ -14,5 +14,5 @@ foo (uint8x16_t *p)
|
||
|
|
*p = vmovq_n_u8 (3); /* { dg-message "called from here" } */
|
||
|
|
}
|
||
|
|
|
||
|
|
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
|
||
|
|
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
|
||
|
|
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-1.c b/gcc/testsuite/gcc.target/i386/pr82361-1.c
|
||
|
|
index e7c356557..dec1792ae 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/i386/pr82361-1.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr82361-1.c
|
||
|
|
@@ -4,50 +4,50 @@
|
||
|
|
/* We should be able to optimize all %eax to %rax zero extensions, because
|
||
|
|
div and idiv instructions with 32-bit operands zero-extend both results. */
|
||
|
|
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
|
||
|
|
-/* FIXME: We are still not able to optimize the modulo in f1/f2, only manage
|
||
|
|
- one. */
|
||
|
|
+/* FIXME: The compiler does not merge zero-extension to the modulo part
|
||
|
|
+ of f1 and f2. */
|
||
|
|
/* { dg-final { scan-assembler-times "movl\t%edx" 2 } } */
|
||
|
|
|
||
|
|
void
|
||
|
|
f1 (unsigned int a, unsigned int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = a / b;
|
||
|
|
- unsigned long long d = a % b;
|
||
|
|
+ register unsigned long long c asm ("rax") = a / b;
|
||
|
|
+ register unsigned long long d asm ("rdx") = a % b;
|
||
|
|
asm volatile ("" : : "r" (c), "r" (d));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f2 (int a, int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = (unsigned int) (a / b);
|
||
|
|
- unsigned long long d = (unsigned int) (a % b);
|
||
|
|
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
|
||
|
|
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
|
||
|
|
asm volatile ("" : : "r" (c), "r" (d));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f3 (unsigned int a, unsigned int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = a / b;
|
||
|
|
+ register unsigned long long c asm ("rax") = a / b;
|
||
|
|
asm volatile ("" : : "r" (c));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f4 (int a, int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = (unsigned int) (a / b);
|
||
|
|
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
|
||
|
|
asm volatile ("" : : "r" (c));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f5 (unsigned int a, unsigned int b)
|
||
|
|
{
|
||
|
|
- unsigned long long d = a % b;
|
||
|
|
+ register unsigned long long d asm ("rdx") = a % b;
|
||
|
|
asm volatile ("" : : "r" (d));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f6 (int a, int b)
|
||
|
|
{
|
||
|
|
- unsigned long long d = (unsigned int) (a % b);
|
||
|
|
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
|
||
|
|
asm volatile ("" : : "r" (d));
|
||
|
|
}
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-2.c b/gcc/testsuite/gcc.target/i386/pr82361-2.c
|
||
|
|
index c1e484d6e..2d87de182 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/i386/pr82361-2.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr82361-2.c
|
||
|
|
@@ -4,7 +4,8 @@
|
||
|
|
/* We should be able to optimize all %eax to %rax zero extensions, because
|
||
|
|
div and idiv instructions with 32-bit operands zero-extend both results. */
|
||
|
|
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
|
||
|
|
-/* Ditto %edx to %rdx zero extensions. */
|
||
|
|
-/* { dg-final { scan-assembler-not "movl\t%edx, %edx" } } */
|
||
|
|
+/* FIXME: The compiler does not merge zero-extension to the modulo part
|
||
|
|
+ of f1 and f2. */
|
||
|
|
+/* { dg-final { scan-assembler-times "movl\t%edx" 4 } } */
|
||
|
|
|
||
|
|
#include "pr82361-1.c"
|
||
|
|
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
|
||
|
|
index 527deffe4..be47519bc 100644
|
||
|
|
--- a/gcc/tree-cfg.c
|
||
|
|
+++ b/gcc/tree-cfg.c
|
||
|
|
@@ -4297,8 +4297,17 @@ verify_gimple_assign_ternary (gassign *stmt)
|
||
|
|
}
|
||
|
|
if (! ((INTEGRAL_TYPE_P (rhs1_type)
|
||
|
|
&& INTEGRAL_TYPE_P (rhs2_type))
|
||
|
|
+ /* Vector element insert. */
|
||
|
|
|| (VECTOR_TYPE_P (rhs1_type)
|
||
|
|
- && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
|
||
|
|
+ && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
|
||
|
|
+ /* Aligned sub-vector insert. */
|
||
|
|
+ || (VECTOR_TYPE_P (rhs1_type)
|
||
|
|
+ && VECTOR_TYPE_P (rhs2_type)
|
||
|
|
+ && types_compatible_p (TREE_TYPE (rhs1_type),
|
||
|
|
+ TREE_TYPE (rhs2_type))
|
||
|
|
+ && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
|
||
|
|
+ TYPE_VECTOR_SUBPARTS (rhs2_type))
|
||
|
|
+ && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
|
||
|
|
{
|
||
|
|
error ("not allowed type combination in BIT_INSERT_EXPR");
|
||
|
|
debug_generic_expr (rhs1_type);
|
||
|
|
diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c
|
||
|
|
index 1dc544b6d..a149f5e79 100644
|
||
|
|
--- a/gcc/tree-ssa.c
|
||
|
|
+++ b/gcc/tree-ssa.c
|
||
|
|
@@ -1522,8 +1522,6 @@ non_rewritable_lvalue_p (tree lhs)
|
||
|
|
if (DECL_P (decl)
|
||
|
|
&& VECTOR_TYPE_P (TREE_TYPE (decl))
|
||
|
|
&& TYPE_MODE (TREE_TYPE (decl)) != BLKmode
|
||
|
|
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
|
||
|
|
- TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
|
||
|
|
&& known_ge (mem_ref_offset (lhs), 0)
|
||
|
|
&& known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
|
||
|
|
mem_ref_offset (lhs))
|
||
|
|
@@ -1531,7 +1529,24 @@ non_rewritable_lvalue_p (tree lhs)
|
||
|
|
TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
|
||
|
|
&& known_ge (wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (decl))),
|
||
|
|
wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (lhs)))))
|
||
|
|
- return false;
|
||
|
|
+ {
|
||
|
|
+ poly_uint64 lhs_bits, nelts;
|
||
|
|
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)), &lhs_bits)
|
||
|
|
+ && multiple_p (lhs_bits,
|
||
|
|
+ tree_to_uhwi
|
||
|
|
+ (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))),
|
||
|
|
+ &nelts))
|
||
|
|
+ {
|
||
|
|
+ if (known_eq (nelts, 1u))
|
||
|
|
+ return false;
|
||
|
|
+ /* For sub-vector inserts the insert vector mode has to be
|
||
|
|
+ supported. */
|
||
|
|
+ tree vtype = build_vector_type (TREE_TYPE (TREE_TYPE (decl)),
|
||
|
|
+ nelts);
|
||
|
|
+ if (TYPE_MODE (vtype) != BLKmode)
|
||
|
|
+ return false;
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
}
|
||
|
|
|
||
|
|
/* A vector-insert using a BIT_FIELD_REF is rewritable using
|
||
|
|
@@ -1869,20 +1884,30 @@ execute_update_addresses_taken (void)
|
||
|
|
&& bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
|
||
|
|
&& VECTOR_TYPE_P (TREE_TYPE (sym))
|
||
|
|
&& TYPE_MODE (TREE_TYPE (sym)) != BLKmode
|
||
|
|
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
|
||
|
|
- TYPE_SIZE_UNIT
|
||
|
|
- (TREE_TYPE (TREE_TYPE (sym))), 0)
|
||
|
|
- && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
|
||
|
|
- && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
|
||
|
|
- TYPE_SIZE_UNIT (TREE_TYPE (sym)))
|
||
|
|
- && (tree_to_uhwi (TREE_OPERAND (lhs, 1))
|
||
|
|
- % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
|
||
|
|
+ && known_ge (mem_ref_offset (lhs), 0)
|
||
|
|
+ && known_gt (wi::to_poly_offset
|
||
|
|
+ (TYPE_SIZE_UNIT (TREE_TYPE (sym))),
|
||
|
|
+ mem_ref_offset (lhs))
|
||
|
|
+ && multiple_of_p (sizetype,
|
||
|
|
+ TREE_OPERAND (lhs, 1),
|
||
|
|
+ TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
|
||
|
|
{
|
||
|
|
tree val = gimple_assign_rhs1 (stmt);
|
||
|
|
if (! types_compatible_p (TREE_TYPE (val),
|
||
|
|
TREE_TYPE (TREE_TYPE (sym))))
|
||
|
|
{
|
||
|
|
- tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
|
||
|
|
+ poly_uint64 lhs_bits, nelts;
|
||
|
|
+ tree temtype = TREE_TYPE (TREE_TYPE (sym));
|
||
|
|
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)),
|
||
|
|
+ &lhs_bits)
|
||
|
|
+ && multiple_p (lhs_bits,
|
||
|
|
+ tree_to_uhwi
|
||
|
|
+ (TYPE_SIZE (TREE_TYPE
|
||
|
|
+ (TREE_TYPE (sym)))),
|
||
|
|
+ &nelts)
|
||
|
|
+ && maybe_ne (nelts, 1u))
|
||
|
|
+ temtype = build_vector_type (temtype, nelts);
|
||
|
|
+ tree tem = make_ssa_name (temtype);
|
||
|
|
gimple *pun
|
||
|
|
= gimple_build_assign (tem,
|
||
|
|
build1 (VIEW_CONVERT_EXPR,
|