gcc/x86-Fix-bf16-and-matrix.patch
eastb233 01e0ec8ea6 Upload GCC feature and bugfix patches.
- avoid-cycling-on-vertain-subreg-reloads.patch: Add patch source comment
- change-gcc-BASE-VER.patch: Likewise
- dont-generate-IF_THEN_ELSE.patch: Likewise
- fix-ICE-in-compute_live_loop_exits.patch: Likewise
- fix-ICE-in-eliminate_stmt.patch: Likewise
- fix-ICE-in-vect_create_epilog_for_reduction.patch: Likewise
- fix-ICE-in-vect_stmt_to_vectorize.patch: Likewise
- fix-ICE-in-verify_ssa.patch: Likewise
- fix-ICE-when-vectorizing-nested-cycles.patch: Likewise
- fix-cost-of-plus.patch: Likewise
- ipa-const-prop-self-recursion-bugfix.patch: Likewise
- simplify-removing-subregs.patch: Likewise
- medium-code-mode.patch: Bugfix
- fix-when-peeling-for-alignment.patch: Move to ...
- fix-PR-92351-When-peeling-for-alignment.patch: ... this
- AArch64-Fix-constraints-for-CPY-M.patch: New file
- Apply-maximum-nunits-for-BB-SLP.patch: New file
- Fix-EXTRACT_LAST_REDUCTION-segfault.patch: New file
- Fix-up-push_partial_def-little-endian-bitfield.patch: New file
- Fix-zero-masking-for-vcvtps2ph.patch: New file
- IRA-Handle-fully-tied-destinations.patch: New file
- SLP-VECT-Add-check-to-fix-96837.patch: New file
- aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch: New file
- aarch64-Fix-bf16-and-matrix-g++-gfortran.patch: New file
- aarch64-Fix-mismatched-SVE-predicate-modes.patch: New file
- aarch64-fix-sve-acle-error.patch: New file
- adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch: New file
- bf16-and-matrix-characteristic.patch: New file
- fix-ICE-IPA-compare-VRP-types.patch: New file
- fix-ICE-in-affine-combination.patch: New file
- fix-ICE-in-pass-vect.patch: New file
- fix-ICE-in-vect_update_misalignment_for_peel.patch: New file
- fix-addlosymdi-ICE-in-pass-reload.patch: New file
- fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch: New file
- fix-avx512vl-vcvttpd2dq-2-fail.patch: New file
- fix-issue499-add-nop-convert.patch: New file
- fix-issue604-ldist-dependency-fixup.patch: New file
- modulo-sched-Carefully-process-loop-counter-initiali.patch: New file
- re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch: New file
- reduction-paths-with-unhandled-live-stmt.patch: New file
- redundant-loop-elimination.patch: New file
- sccvn-Improve-handling-of-load-masked-with-integer.patch: New file
- speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch: New file
- store-merging-Consider-also-overlapping-stores-earlier.patch: New file
- tree-optimization-96920-another-ICE-when-vectorizing.patch: New file
- tree-optimization-97812-fix-range-query-in-VRP-asser.patch: New file
- vectorizable-comparison-Swap-operands-only-once.patch: New file
- x86-Fix-bf16-and-matrix.patch: New file
2020-12-30 09:54:10 +08:00

This backport contains 4 patches from the GCC mainstream tree.
The commit IDs of these patches are listed below in chronological order.
0001-re-PR-target-90424-memcpy-into-vector-builtin-not-op.patch
1bf2a0b90f2457f6d9301535560eb5e05978261b
0002-testsuite-aarch64-arm-Add-missing-quotes-to-expected.patch
0ec537f3500924f29505977aa89c2a1d4671c584
0003-x86-Tweak-testcases-for-PR82361.patch
ad4644f378fe2f731cd987a4aff14b935f530b88
0004-x86-Robustify-vzeroupper-handling-across-calls.patch
2a2e3a0dfcbe0861915f421d11b828f0c35023f0
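
For reference, PR target/90424 (the first patch above) concerns __builtin_memcpy of a partial vector being lowered through the stack instead of a single load plus vector insert; the new pr90424-1.C/pr90424-2.C testcases below exercise exactly this. A minimal C sketch of the affected pattern, with an illustrative function name and sizes that are not part of the patch itself:

/* Illustrative only: copying 8 bytes into a 16-byte vector should become
   a single 64-bit load plus a vector insert rather than a round trip
   through the stack (compare the pr90424-1.C / pr90424-2.C testcases).  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
load_low_half (const void *p)
{
  v4si r = { 0, 0, 0, 0 };
  __builtin_memcpy (&r, p, 8);   /* fills only the low two lanes */
  return r;
}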
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9282a8fb6..ba72da1ec 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3. If not see
#include "i386-builtins.h"
#include "i386-expand.h"
#include "i386-features.h"
+#include "function-abi.h"
/* This file should be included last. */
#include "target-def.h"
@@ -13529,6 +13530,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
}
+ /* If the function is known to preserve some SSE registers,
+ RA and previous passes can legitimately rely on that for
+ modes wider than 256 bits. It's only safe to issue a
+ vzeroupper if all SSE registers are clobbered. */
+ const function_abi &abi = insn_callee_abi (insn);
+ if (!hard_reg_set_subset_p (reg_class_contents[ALL_SSE_REGS],
+ abi.mode_clobbers (V4DImode)))
+ return AVX_U128_ANY;
+
return AVX_U128_CLEAN;
}
diff --git a/gcc/testsuite/g++.target/i386/pr90424-1.C b/gcc/testsuite/g++.target/i386/pr90424-1.C
new file mode 100644
index 000000000..9df8c089b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr90424-1.C
@@ -0,0 +1,32 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, unsigned M = sizeof(V<T>)>
+V<T> load(const void *p) {
+ using W = V<T>;
+ W r;
+ __builtin_memcpy(&r, p, M);
+ return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *); // bad
+template V<short> load<short, 8>(const void *); // bad
+template V<int> load<int, 8>(const void *); // bad
+template V<long> load<long, 8>(const void *); // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *); // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *); // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *); // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function. */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr90424-2.C b/gcc/testsuite/g++.target/i386/pr90424-2.C
new file mode 100644
index 000000000..3abb65f45
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr90424-2.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, unsigned M = sizeof(V<T>)>
+V<T> load(const void *p) {
+ V<T> r = {};
+ __builtin_memcpy(&r, p, M);
+ return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *); // bad
+template V<short> load<short, 8>(const void *); // bad
+template V<int> load<int, 8>(const void *); // bad
+template V<long> load<long, 8>(const void *); // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *); // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *); // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *); // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function. */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
index 184990471..d96a8733a 100644
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
@@ -13,4 +13,4 @@ foo (uint8x16_t a, uint8x16_t b, uint8x16_t c)
return vbslq_u8 (a, b, c); /* { dg-message "called from here" } */
}
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
diff --git a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
index 05dc579f2..fb6e0b9cd 100644
--- a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
+++ b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
@@ -14,5 +14,5 @@ foo (uint8x16_t *p)
*p = vmovq_n_u8 (3); /* { dg-message "called from here" } */
}
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-1.c b/gcc/testsuite/gcc.target/i386/pr82361-1.c
index e7c356557..dec1792ae 100644
--- a/gcc/testsuite/gcc.target/i386/pr82361-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr82361-1.c
@@ -4,50 +4,50 @@
/* We should be able to optimize all %eax to %rax zero extensions, because
div and idiv instructions with 32-bit operands zero-extend both results. */
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
-/* FIXME: We are still not able to optimize the modulo in f1/f2, only manage
- one. */
+/* FIXME: The compiler does not merge zero-extension to the modulo part
+ of f1 and f2. */
/* { dg-final { scan-assembler-times "movl\t%edx" 2 } } */
void
f1 (unsigned int a, unsigned int b)
{
- unsigned long long c = a / b;
- unsigned long long d = a % b;
+ register unsigned long long c asm ("rax") = a / b;
+ register unsigned long long d asm ("rdx") = a % b;
asm volatile ("" : : "r" (c), "r" (d));
}
void
f2 (int a, int b)
{
- unsigned long long c = (unsigned int) (a / b);
- unsigned long long d = (unsigned int) (a % b);
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
asm volatile ("" : : "r" (c), "r" (d));
}
void
f3 (unsigned int a, unsigned int b)
{
- unsigned long long c = a / b;
+ register unsigned long long c asm ("rax") = a / b;
asm volatile ("" : : "r" (c));
}
void
f4 (int a, int b)
{
- unsigned long long c = (unsigned int) (a / b);
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
asm volatile ("" : : "r" (c));
}
void
f5 (unsigned int a, unsigned int b)
{
- unsigned long long d = a % b;
+ register unsigned long long d asm ("rdx") = a % b;
asm volatile ("" : : "r" (d));
}
void
f6 (int a, int b)
{
- unsigned long long d = (unsigned int) (a % b);
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
asm volatile ("" : : "r" (d));
}
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-2.c b/gcc/testsuite/gcc.target/i386/pr82361-2.c
index c1e484d6e..2d87de182 100644
--- a/gcc/testsuite/gcc.target/i386/pr82361-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr82361-2.c
@@ -4,7 +4,8 @@
/* We should be able to optimize all %eax to %rax zero extensions, because
div and idiv instructions with 32-bit operands zero-extend both results. */
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
-/* Ditto %edx to %rdx zero extensions. */
-/* { dg-final { scan-assembler-not "movl\t%edx, %edx" } } */
+/* FIXME: The compiler does not merge zero-extension to the modulo part
+ of f1 and f2. */
+/* { dg-final { scan-assembler-times "movl\t%edx" 4 } } */
#include "pr82361-1.c"
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 527deffe4..be47519bc 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -4297,8 +4297,17 @@ verify_gimple_assign_ternary (gassign *stmt)
}
if (! ((INTEGRAL_TYPE_P (rhs1_type)
&& INTEGRAL_TYPE_P (rhs2_type))
+ /* Vector element insert. */
|| (VECTOR_TYPE_P (rhs1_type)
- && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
+ && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
+ /* Aligned sub-vector insert. */
+ || (VECTOR_TYPE_P (rhs1_type)
+ && VECTOR_TYPE_P (rhs2_type)
+ && types_compatible_p (TREE_TYPE (rhs1_type),
+ TREE_TYPE (rhs2_type))
+ && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
+ TYPE_VECTOR_SUBPARTS (rhs2_type))
+ && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
{
error ("not allowed type combination in BIT_INSERT_EXPR");
debug_generic_expr (rhs1_type);
diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c
index 1dc544b6d..a149f5e79 100644
--- a/gcc/tree-ssa.c
+++ b/gcc/tree-ssa.c
@@ -1522,8 +1522,6 @@ non_rewritable_lvalue_p (tree lhs)
if (DECL_P (decl)
&& VECTOR_TYPE_P (TREE_TYPE (decl))
&& TYPE_MODE (TREE_TYPE (decl)) != BLKmode
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
- TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
&& known_ge (mem_ref_offset (lhs), 0)
&& known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
mem_ref_offset (lhs))
@@ -1531,7 +1529,24 @@ non_rewritable_lvalue_p (tree lhs)
TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
&& known_ge (wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (decl))),
wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (lhs)))))
- return false;
+ {
+ poly_uint64 lhs_bits, nelts;
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)), &lhs_bits)
+ && multiple_p (lhs_bits,
+ tree_to_uhwi
+ (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))),
+ &nelts))
+ {
+ if (known_eq (nelts, 1u))
+ return false;
+ /* For sub-vector inserts the insert vector mode has to be
+ supported. */
+ tree vtype = build_vector_type (TREE_TYPE (TREE_TYPE (decl)),
+ nelts);
+ if (TYPE_MODE (vtype) != BLKmode)
+ return false;
+ }
+ }
}
/* A vector-insert using a BIT_FIELD_REF is rewritable using
@@ -1869,20 +1884,30 @@ execute_update_addresses_taken (void)
&& bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
&& VECTOR_TYPE_P (TREE_TYPE (sym))
&& TYPE_MODE (TREE_TYPE (sym)) != BLKmode
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
- TYPE_SIZE_UNIT
- (TREE_TYPE (TREE_TYPE (sym))), 0)
- && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
- && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
- TYPE_SIZE_UNIT (TREE_TYPE (sym)))
- && (tree_to_uhwi (TREE_OPERAND (lhs, 1))
- % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
+ && known_ge (mem_ref_offset (lhs), 0)
+ && known_gt (wi::to_poly_offset
+ (TYPE_SIZE_UNIT (TREE_TYPE (sym))),
+ mem_ref_offset (lhs))
+ && multiple_of_p (sizetype,
+ TREE_OPERAND (lhs, 1),
+ TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
{
tree val = gimple_assign_rhs1 (stmt);
if (! types_compatible_p (TREE_TYPE (val),
TREE_TYPE (TREE_TYPE (sym))))
{
- tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
+ poly_uint64 lhs_bits, nelts;
+ tree temtype = TREE_TYPE (TREE_TYPE (sym));
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)),
+ &lhs_bits)
+ && multiple_p (lhs_bits,
+ tree_to_uhwi
+ (TYPE_SIZE (TREE_TYPE
+ (TREE_TYPE (sym)))),
+ &nelts)
+ && maybe_ne (nelts, 1u))
+ temtype = build_vector_type (temtype, nelts);
+ tree tem = make_ssa_name (temtype);
gimple *pun
= gimple_build_assign (tem,
build1 (VIEW_CONVERT_EXPR,