322 lines
12 KiB
Diff
322 lines
12 KiB
Diff
|
|
This backport contains 4 patches from the upstream GCC mainline tree.
|
||
|
|
The commit IDs of these patches are listed below in chronological order.
|
||
|
|
|
||
|
|
0001-re-PR-target-90424-memcpy-into-vector-builtin-not-op.patch
|
||
|
|
1bf2a0b90f2457f6d9301535560eb5e05978261b
|
||
|
|
|
||
|
|
0002-testsuite-aarch64-arm-Add-missing-quotes-to-expected.patch
|
||
|
|
0ec537f3500924f29505977aa89c2a1d4671c584
|
||
|
|
|
||
|
|
0003-x86-Tweak-testcases-for-PR82361.patch
|
||
|
|
ad4644f378fe2f731cd987a4aff14b935f530b88
|
||
|
|
|
||
|
|
0004-x86-Robustify-vzeroupper-handling-across-calls.patch
|
||
|
|
2a2e3a0dfcbe0861915f421d11b828f0c35023f0
|
||
|
|
|
||
|
|
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
|
||
|
|
index 9282a8fb6..ba72da1ec 100644
|
||
|
|
--- a/gcc/config/i386/i386.c
|
||
|
|
+++ b/gcc/config/i386/i386.c
|
||
|
|
@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3. If not see
|
||
|
|
#include "i386-builtins.h"
|
||
|
|
#include "i386-expand.h"
|
||
|
|
#include "i386-features.h"
|
||
|
|
+#include "function-abi.h"
|
||
|
|
|
||
|
|
/* This file should be included last. */
|
||
|
|
#include "target-def.h"
|
||
|
|
@@ -13529,6 +13530,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
+ /* If the function is known to preserve some SSE registers,
|
||
|
|
+ RA and previous passes can legitimately rely on that for
|
||
|
|
+ modes wider than 256 bits. It's only safe to issue a
|
||
|
|
+ vzeroupper if all SSE registers are clobbered. */
|
||
|
|
+ const function_abi &abi = insn_callee_abi (insn);
|
||
|
|
+ if (!hard_reg_set_subset_p (reg_class_contents[ALL_SSE_REGS],
|
||
|
|
+ abi.mode_clobbers (V4DImode)))
|
||
|
|
+ return AVX_U128_ANY;
|
||
|
|
+
|
||
|
|
return AVX_U128_CLEAN;
|
||
|
|
}
|
||
|
|
|
||
|
|
diff --git a/gcc/testsuite/g++.target/i386/pr90424-1.C b/gcc/testsuite/g++.target/i386/pr90424-1.C
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..9df8c089b
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/g++.target/i386/pr90424-1.C
|
||
|
|
@@ -0,0 +1,32 @@
|
||
|
|
+/* { dg-do compile { target c++11 } } */
|
||
|
|
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
|
||
|
|
+
|
||
|
|
+template <class T>
|
||
|
|
+using V [[gnu::vector_size(16)]] = T;
|
||
|
|
+
|
||
|
|
+template <class T, unsigned M = sizeof(V<T>)>
|
||
|
|
+V<T> load(const void *p) {
|
||
|
|
+ using W = V<T>;
|
||
|
|
+ W r;
|
||
|
|
+ __builtin_memcpy(&r, p, M);
|
||
|
|
+ return r;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+// movq or movsd
|
||
|
|
+template V<char> load<char, 8>(const void *); // bad
|
||
|
|
+template V<short> load<short, 8>(const void *); // bad
|
||
|
|
+template V<int> load<int, 8>(const void *); // bad
|
||
|
|
+template V<long> load<long, 8>(const void *); // good
|
||
|
|
+// the following is disabled because V2SF isn't a supported mode
|
||
|
|
+// template V<float> load<float, 8>(const void *); // bad
|
||
|
|
+template V<double> load<double, 8>(const void *); // good (movsd?)
|
||
|
|
+
|
||
|
|
+// movd or movss
|
||
|
|
+template V<char> load<char, 4>(const void *); // bad
|
||
|
|
+template V<short> load<short, 4>(const void *); // bad
|
||
|
|
+template V<int> load<int, 4>(const void *); // good
|
||
|
|
+template V<float> load<float, 4>(const void *); // good
|
||
|
|
+
|
||
|
|
+/* We should end up with one load and one insert for each function. */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
|
||
|
|
diff --git a/gcc/testsuite/g++.target/i386/pr90424-2.C b/gcc/testsuite/g++.target/i386/pr90424-2.C
|
||
|
|
new file mode 100644
|
||
|
|
index 000000000..3abb65f45
|
||
|
|
--- /dev/null
|
||
|
|
+++ b/gcc/testsuite/g++.target/i386/pr90424-2.C
|
||
|
|
@@ -0,0 +1,31 @@
|
||
|
|
+/* { dg-do compile { target c++11 } } */
|
||
|
|
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
|
||
|
|
+
|
||
|
|
+template <class T>
|
||
|
|
+using V [[gnu::vector_size(16)]] = T;
|
||
|
|
+
|
||
|
|
+template <class T, unsigned M = sizeof(V<T>)>
|
||
|
|
+V<T> load(const void *p) {
|
||
|
|
+ V<T> r = {};
|
||
|
|
+ __builtin_memcpy(&r, p, M);
|
||
|
|
+ return r;
|
||
|
|
+}
|
||
|
|
+
|
||
|
|
+// movq or movsd
|
||
|
|
+template V<char> load<char, 8>(const void *); // bad
|
||
|
|
+template V<short> load<short, 8>(const void *); // bad
|
||
|
|
+template V<int> load<int, 8>(const void *); // bad
|
||
|
|
+template V<long> load<long, 8>(const void *); // good
|
||
|
|
+// the following is disabled because V2SF isn't a supported mode
|
||
|
|
+// template V<float> load<float, 8>(const void *); // bad
|
||
|
|
+template V<double> load<double, 8>(const void *); // good (movsd?)
|
||
|
|
+
|
||
|
|
+// movd or movss
|
||
|
|
+template V<char> load<char, 4>(const void *); // bad
|
||
|
|
+template V<short> load<short, 4>(const void *); // bad
|
||
|
|
+template V<int> load<int, 4>(const void *); // good
|
||
|
|
+template V<float> load<float, 4>(const void *); // good
|
||
|
|
+
|
||
|
|
+/* We should end up with one load and one insert for each function. */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
|
||
|
|
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
|
||
|
|
index 184990471..d96a8733a 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
|
||
|
|
@@ -13,4 +13,4 @@ foo (uint8x16_t a, uint8x16_t b, uint8x16_t c)
|
||
|
|
return vbslq_u8 (a, b, c); /* { dg-message "called from here" } */
|
||
|
|
}
|
||
|
|
|
||
|
|
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
|
||
|
|
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
|
||
|
|
index 05dc579f2..fb6e0b9cd 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
|
||
|
|
@@ -14,5 +14,5 @@ foo (uint8x16_t *p)
|
||
|
|
*p = vmovq_n_u8 (3); /* { dg-message "called from here" } */
|
||
|
|
}
|
||
|
|
|
||
|
|
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
|
||
|
|
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
|
||
|
|
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-1.c b/gcc/testsuite/gcc.target/i386/pr82361-1.c
|
||
|
|
index e7c356557..dec1792ae 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/i386/pr82361-1.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr82361-1.c
|
||
|
|
@@ -4,50 +4,50 @@
|
||
|
|
/* We should be able to optimize all %eax to %rax zero extensions, because
|
||
|
|
div and idiv instructions with 32-bit operands zero-extend both results. */
|
||
|
|
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
|
||
|
|
-/* FIXME: We are still not able to optimize the modulo in f1/f2, only manage
|
||
|
|
- one. */
|
||
|
|
+/* FIXME: The compiler does not merge zero-extension to the modulo part
|
||
|
|
+ of f1 and f2. */
|
||
|
|
/* { dg-final { scan-assembler-times "movl\t%edx" 2 } } */
|
||
|
|
|
||
|
|
void
|
||
|
|
f1 (unsigned int a, unsigned int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = a / b;
|
||
|
|
- unsigned long long d = a % b;
|
||
|
|
+ register unsigned long long c asm ("rax") = a / b;
|
||
|
|
+ register unsigned long long d asm ("rdx") = a % b;
|
||
|
|
asm volatile ("" : : "r" (c), "r" (d));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f2 (int a, int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = (unsigned int) (a / b);
|
||
|
|
- unsigned long long d = (unsigned int) (a % b);
|
||
|
|
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
|
||
|
|
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
|
||
|
|
asm volatile ("" : : "r" (c), "r" (d));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f3 (unsigned int a, unsigned int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = a / b;
|
||
|
|
+ register unsigned long long c asm ("rax") = a / b;
|
||
|
|
asm volatile ("" : : "r" (c));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f4 (int a, int b)
|
||
|
|
{
|
||
|
|
- unsigned long long c = (unsigned int) (a / b);
|
||
|
|
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
|
||
|
|
asm volatile ("" : : "r" (c));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f5 (unsigned int a, unsigned int b)
|
||
|
|
{
|
||
|
|
- unsigned long long d = a % b;
|
||
|
|
+ register unsigned long long d asm ("rdx") = a % b;
|
||
|
|
asm volatile ("" : : "r" (d));
|
||
|
|
}
|
||
|
|
|
||
|
|
void
|
||
|
|
f6 (int a, int b)
|
||
|
|
{
|
||
|
|
- unsigned long long d = (unsigned int) (a % b);
|
||
|
|
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
|
||
|
|
asm volatile ("" : : "r" (d));
|
||
|
|
}
|
||
|
|
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-2.c b/gcc/testsuite/gcc.target/i386/pr82361-2.c
|
||
|
|
index c1e484d6e..2d87de182 100644
|
||
|
|
--- a/gcc/testsuite/gcc.target/i386/pr82361-2.c
|
||
|
|
+++ b/gcc/testsuite/gcc.target/i386/pr82361-2.c
|
||
|
|
@@ -4,7 +4,8 @@
|
||
|
|
/* We should be able to optimize all %eax to %rax zero extensions, because
|
||
|
|
div and idiv instructions with 32-bit operands zero-extend both results. */
|
||
|
|
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
|
||
|
|
-/* Ditto %edx to %rdx zero extensions. */
|
||
|
|
-/* { dg-final { scan-assembler-not "movl\t%edx, %edx" } } */
|
||
|
|
+/* FIXME: The compiler does not merge zero-extension to the modulo part
|
||
|
|
+ of f1 and f2. */
|
||
|
|
+/* { dg-final { scan-assembler-times "movl\t%edx" 4 } } */
|
||
|
|
|
||
|
|
#include "pr82361-1.c"
|
||
|
|
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
|
||
|
|
index 527deffe4..be47519bc 100644
|
||
|
|
--- a/gcc/tree-cfg.c
|
||
|
|
+++ b/gcc/tree-cfg.c
|
||
|
|
@@ -4297,8 +4297,17 @@ verify_gimple_assign_ternary (gassign *stmt)
|
||
|
|
}
|
||
|
|
if (! ((INTEGRAL_TYPE_P (rhs1_type)
|
||
|
|
&& INTEGRAL_TYPE_P (rhs2_type))
|
||
|
|
+ /* Vector element insert. */
|
||
|
|
|| (VECTOR_TYPE_P (rhs1_type)
|
||
|
|
- && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
|
||
|
|
+ && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
|
||
|
|
+ /* Aligned sub-vector insert. */
|
||
|
|
+ || (VECTOR_TYPE_P (rhs1_type)
|
||
|
|
+ && VECTOR_TYPE_P (rhs2_type)
|
||
|
|
+ && types_compatible_p (TREE_TYPE (rhs1_type),
|
||
|
|
+ TREE_TYPE (rhs2_type))
|
||
|
|
+ && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
|
||
|
|
+ TYPE_VECTOR_SUBPARTS (rhs2_type))
|
||
|
|
+ && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
|
||
|
|
{
|
||
|
|
error ("not allowed type combination in BIT_INSERT_EXPR");
|
||
|
|
debug_generic_expr (rhs1_type);
|
||
|
|
diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c
|
||
|
|
index 1dc544b6d..a149f5e79 100644
|
||
|
|
--- a/gcc/tree-ssa.c
|
||
|
|
+++ b/gcc/tree-ssa.c
|
||
|
|
@@ -1522,8 +1522,6 @@ non_rewritable_lvalue_p (tree lhs)
|
||
|
|
if (DECL_P (decl)
|
||
|
|
&& VECTOR_TYPE_P (TREE_TYPE (decl))
|
||
|
|
&& TYPE_MODE (TREE_TYPE (decl)) != BLKmode
|
||
|
|
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
|
||
|
|
- TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
|
||
|
|
&& known_ge (mem_ref_offset (lhs), 0)
|
||
|
|
&& known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
|
||
|
|
mem_ref_offset (lhs))
|
||
|
|
@@ -1531,7 +1529,24 @@ non_rewritable_lvalue_p (tree lhs)
|
||
|
|
TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
|
||
|
|
&& known_ge (wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (decl))),
|
||
|
|
wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (lhs)))))
|
||
|
|
- return false;
|
||
|
|
+ {
|
||
|
|
+ poly_uint64 lhs_bits, nelts;
|
||
|
|
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)), &lhs_bits)
|
||
|
|
+ && multiple_p (lhs_bits,
|
||
|
|
+ tree_to_uhwi
|
||
|
|
+ (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))),
|
||
|
|
+ &nelts))
|
||
|
|
+ {
|
||
|
|
+ if (known_eq (nelts, 1u))
|
||
|
|
+ return false;
|
||
|
|
+ /* For sub-vector inserts the insert vector mode has to be
|
||
|
|
+ supported. */
|
||
|
|
+ tree vtype = build_vector_type (TREE_TYPE (TREE_TYPE (decl)),
|
||
|
|
+ nelts);
|
||
|
|
+ if (TYPE_MODE (vtype) != BLKmode)
|
||
|
|
+ return false;
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
}
|
||
|
|
|
||
|
|
/* A vector-insert using a BIT_FIELD_REF is rewritable using
|
||
|
|
@@ -1869,20 +1884,30 @@ execute_update_addresses_taken (void)
|
||
|
|
&& bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
|
||
|
|
&& VECTOR_TYPE_P (TREE_TYPE (sym))
|
||
|
|
&& TYPE_MODE (TREE_TYPE (sym)) != BLKmode
|
||
|
|
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
|
||
|
|
- TYPE_SIZE_UNIT
|
||
|
|
- (TREE_TYPE (TREE_TYPE (sym))), 0)
|
||
|
|
- && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
|
||
|
|
- && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
|
||
|
|
- TYPE_SIZE_UNIT (TREE_TYPE (sym)))
|
||
|
|
- && (tree_to_uhwi (TREE_OPERAND (lhs, 1))
|
||
|
|
- % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
|
||
|
|
+ && known_ge (mem_ref_offset (lhs), 0)
|
||
|
|
+ && known_gt (wi::to_poly_offset
|
||
|
|
+ (TYPE_SIZE_UNIT (TREE_TYPE (sym))),
|
||
|
|
+ mem_ref_offset (lhs))
|
||
|
|
+ && multiple_of_p (sizetype,
|
||
|
|
+ TREE_OPERAND (lhs, 1),
|
||
|
|
+ TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
|
||
|
|
{
|
||
|
|
tree val = gimple_assign_rhs1 (stmt);
|
||
|
|
if (! types_compatible_p (TREE_TYPE (val),
|
||
|
|
TREE_TYPE (TREE_TYPE (sym))))
|
||
|
|
{
|
||
|
|
- tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
|
||
|
|
+ poly_uint64 lhs_bits, nelts;
|
||
|
|
+ tree temtype = TREE_TYPE (TREE_TYPE (sym));
|
||
|
|
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)),
|
||
|
|
+ &lhs_bits)
|
||
|
|
+ && multiple_p (lhs_bits,
|
||
|
|
+ tree_to_uhwi
|
||
|
|
+ (TYPE_SIZE (TREE_TYPE
|
||
|
|
+ (TREE_TYPE (sym)))),
|
||
|
|
+ &nelts)
|
||
|
|
+ && maybe_ne (nelts, 1u))
|
||
|
|
+ temtype = build_vector_type (temtype, nelts);
|
||
|
|
+ tree tem = make_ssa_name (temtype);
|
||
|
|
gimple *pun
|
||
|
|
= gimple_build_assign (tem,
|
||
|
|
build1 (VIEW_CONVERT_EXPR,
|