This backport contains 4 patches from the GCC mainstream tree. The commit IDs
of these patches are listed below, in chronological order.

0001-re-PR-target-90424-memcpy-into-vector-builtin-not-op.patch 1bf2a0b90f2457f6d9301535560eb5e05978261b
0002-testsuite-aarch64-arm-Add-missing-quotes-to-expected.patch 0ec537f3500924f29505977aa89c2a1d4671c584
0003-x86-Tweak-testcases-for-PR82361.patch ad4644f378fe2f731cd987a4aff14b935f530b88
0004-x86-Robustify-vzeroupper-handling-across-calls.patch 2a2e3a0dfcbe0861915f421d11b828f0c35023f0

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9282a8fb6..ba72da1ec 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "i386-builtins.h"
 #include "i386-expand.h"
 #include "i386-features.h"
+#include "function-abi.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -13529,6 +13530,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
             }
         }
 
+      /* If the function is known to preserve some SSE registers,
+         RA and previous passes can legitimately rely on that for
+         modes wider than 256 bits.  It's only safe to issue a
+         vzeroupper if all SSE registers are clobbered.  */
+      const function_abi &abi = insn_callee_abi (insn);
+      if (!hard_reg_set_subset_p (reg_class_contents[ALL_SSE_REGS],
+                                  abi.mode_clobbers (V4DImode)))
+        return AVX_U128_ANY;
+
       return AVX_U128_CLEAN;
     }
 
diff --git a/gcc/testsuite/g++.target/i386/pr90424-1.C b/gcc/testsuite/g++.target/i386/pr90424-1.C
new file mode 100644
index 000000000..9df8c089b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr90424-1.C
@@ -0,0 +1,32 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, int M = sizeof(V<T>)>
+V<T> load(const void *p) {
+  using W = V<T>;
+  W r;
+  __builtin_memcpy(&r, p, M);
+  return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *); // bad
+template V<short> load<short, 8>(const void *); // bad
+template V<int> load<int, 8>(const void *); // bad
+template V<long long> load<long long, 8>(const void *); // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *); // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *); // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *); // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function.  */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr90424-2.C b/gcc/testsuite/g++.target/i386/pr90424-2.C
new file mode 100644
index 000000000..3abb65f45
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr90424-2.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, int M = sizeof(V<T>)>
+V<T> load(const void *p) {
+  V<T> r = {};
+  __builtin_memcpy(&r, p, M);
+  return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *); // bad
+template V<short> load<short, 8>(const void *); // bad
+template V<int> load<int, 8>(const void *); // bad
+template V<long long> load<long long, 8>(const void *); // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *); // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *); // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *); // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function.  */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
index 184990471..d96a8733a 100644
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
@@ -13,4 +13,4 @@ foo (uint8x16_t a, uint8x16_t b, uint8x16_t c)
   return vbslq_u8 (a, b, c); /* { dg-message "called from here" } */
 }
 
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
diff --git a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
index 05dc579f2..fb6e0b9cd 100644
--- a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
+++ b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
@@ -14,5 +14,5 @@ foo (uint8x16_t *p)
   *p = vmovq_n_u8 (3); /* { dg-message "called from here" } */
 }
 
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-1.c b/gcc/testsuite/gcc.target/i386/pr82361-1.c
index e7c356557..dec1792ae 100644
--- a/gcc/testsuite/gcc.target/i386/pr82361-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr82361-1.c
@@ -4,50 +4,50 @@
 /* We should be able to optimize all %eax to %rax zero extensions, because
    div and idiv instructions with 32-bit operands zero-extend both results.  */
 /* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
-/* FIXME: We are still not able to optimize the modulo in f1/f2, only manage
-   one.  */
+/* FIXME: The compiler does not merge zero-extension to the modulo part
+   of f1 and f2.  */
 /* { dg-final { scan-assembler-times "movl\t%edx" 2 } } */
 
 void
 f1 (unsigned int a, unsigned int b)
 {
-  unsigned long long c = a / b;
-  unsigned long long d = a % b;
+  register unsigned long long c asm ("rax") = a / b;
+  register unsigned long long d asm ("rdx") = a % b;
   asm volatile ("" : : "r" (c), "r" (d));
 }
 
 void
 f2 (int a, int b)
 {
-  unsigned long long c = (unsigned int) (a / b);
-  unsigned long long d = (unsigned int) (a % b);
+  register unsigned long long c asm ("rax") = (unsigned int) (a / b);
+  register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
   asm volatile ("" : : "r" (c), "r" (d));
 }
 
 void
 f3 (unsigned int a, unsigned int b)
 {
-  unsigned long long c = a / b;
+  register unsigned long long c asm ("rax") = a / b;
   asm volatile ("" : : "r" (c));
 }
 
 void
 f4 (int a, int b)
 {
-  unsigned long long c = (unsigned int) (a / b);
+  register unsigned long long c asm ("rax") = (unsigned int) (a / b);
   asm volatile ("" : : "r" (c));
 }
 
 void
 f5 (unsigned int a, unsigned int b)
 {
-  unsigned long long d = a % b;
+  register unsigned long long d asm ("rdx") = a % b;
   asm volatile ("" : : "r" (d));
 }
 
 void
 f6 (int a, int b)
 {
-  unsigned long long d = (unsigned int) (a % b);
+  register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
   asm volatile ("" : : "r" (d));
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-2.c b/gcc/testsuite/gcc.target/i386/pr82361-2.c
index c1e484d6e..2d87de182 100644
--- a/gcc/testsuite/gcc.target/i386/pr82361-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr82361-2.c
@@ -4,7 +4,8 @@
 /* We should be able to optimize all %eax to %rax zero extensions, because
    div and idiv instructions with 32-bit operands zero-extend both results.  */
 /* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
-/* Ditto %edx to %rdx zero extensions.  */
-/* { dg-final { scan-assembler-not "movl\t%edx, %edx" } } */
+/* FIXME: The compiler does not merge zero-extension to the modulo part
+   of f1 and f2.  */
+/* { dg-final { scan-assembler-times "movl\t%edx" 4 } } */
 
 #include "pr82361-1.c"
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 527deffe4..be47519bc 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -4297,8 +4297,17 @@ verify_gimple_assign_ternary (gassign *stmt)
         }
       if (! ((INTEGRAL_TYPE_P (rhs1_type)
              && INTEGRAL_TYPE_P (rhs2_type))
+            /* Vector element insert.  */
             || (VECTOR_TYPE_P (rhs1_type)
-                && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
+                && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
+            /* Aligned sub-vector insert.  */
+            || (VECTOR_TYPE_P (rhs1_type)
+                && VECTOR_TYPE_P (rhs2_type)
+                && types_compatible_p (TREE_TYPE (rhs1_type),
+                                       TREE_TYPE (rhs2_type))
+                && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
+                               TYPE_VECTOR_SUBPARTS (rhs2_type))
+                && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
        {
          error ("not allowed type combination in BIT_INSERT_EXPR");
          debug_generic_expr (rhs1_type);
diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c
index 1dc544b6d..a149f5e79 100644
--- a/gcc/tree-ssa.c
+++ b/gcc/tree-ssa.c
@@ -1522,8 +1522,6 @@ non_rewritable_lvalue_p (tree lhs)
       if (DECL_P (decl)
          && VECTOR_TYPE_P (TREE_TYPE (decl))
          && TYPE_MODE (TREE_TYPE (decl)) != BLKmode
-         && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
-                             TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
          && known_ge (mem_ref_offset (lhs), 0)
          && known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
                       mem_ref_offset (lhs))
@@ -1531,7 +1529,24 @@
                       TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
          && known_ge (wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (decl))),
                       wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (lhs)))))
-       return false;
+       {
+         poly_uint64 lhs_bits, nelts;
+         if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)), &lhs_bits)
+             && multiple_p (lhs_bits,
+                            tree_to_uhwi
+                              (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))),
+                            &nelts))
+           {
+             if (known_eq (nelts, 1u))
+               return false;
+             /* For sub-vector inserts the insert vector mode has to be
+                supported.  */
+             tree vtype = build_vector_type (TREE_TYPE (TREE_TYPE (decl)),
+                                             nelts);
+             if (TYPE_MODE (vtype) != BLKmode)
+               return false;
+           }
+       }
     }
 
   /* A vector-insert using a BIT_FIELD_REF is rewritable using
@@ -1869,20 +1884,30 @@ execute_update_addresses_taken (void)
                    && bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
                    && VECTOR_TYPE_P (TREE_TYPE (sym))
                    && TYPE_MODE (TREE_TYPE (sym)) != BLKmode
-                   && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
-                                       TYPE_SIZE_UNIT
-                                         (TREE_TYPE (TREE_TYPE (sym))), 0)
-                   && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
-                   && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
-                                       TYPE_SIZE_UNIT (TREE_TYPE (sym)))
-                   && (tree_to_uhwi (TREE_OPERAND (lhs, 1))
-                       % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
+                   && known_ge (mem_ref_offset (lhs), 0)
+                   && known_gt (wi::to_poly_offset
+                                  (TYPE_SIZE_UNIT (TREE_TYPE (sym))),
+                                mem_ref_offset (lhs))
+                   && multiple_of_p (sizetype,
+                                     TREE_OPERAND (lhs, 1),
+                                     TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
                  {
                    tree val = gimple_assign_rhs1 (stmt);
                    if (! types_compatible_p (TREE_TYPE (val),
                                              TREE_TYPE (TREE_TYPE (sym))))
                      {
-                       tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
+                       poly_uint64 lhs_bits, nelts;
+                       tree temtype = TREE_TYPE (TREE_TYPE (sym));
+                       if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)),
+                                            &lhs_bits)
+                           && multiple_p (lhs_bits,
+                                          tree_to_uhwi
+                                            (TYPE_SIZE (TREE_TYPE
+                                                          (TREE_TYPE (sym)))),
+                                          &nelts)
+                           && maybe_ne (nelts, 1u))
+                         temtype = build_vector_type (temtype, nelts);
+                       tree tem = make_ssa_name (temtype);
                        gimple *pun
                          = gimple_build_assign (tem,
                                                 build1 (VIEW_CONVERT_EXPR,