gcc/x86-Fix-bf16-and-matrix.patch
eastb233 01e0ec8ea6 Upload GCC feature and bugfix patches.
- avoid-cycling-on-vertain-subreg-reloads.patch: Add patch source comment
- change-gcc-BASE-VER.patch: Likewise
- dont-generate-IF_THEN_ELSE.patch: Likewise
- fix-ICE-in-compute_live_loop_exits.patch: Likewise
- fix-ICE-in-eliminate_stmt.patch: Likewise
- fix-ICE-in-vect_create_epilog_for_reduction.patch: Likewise
- fix-ICE-in-vect_stmt_to_vectorize.patch: Likewise
- fix-ICE-in-verify_ssa.patch: Likewise
- fix-ICE-when-vectorizing-nested-cycles.patch: Likewise
- fix-cost-of-plus.patch: Likewise
- ipa-const-prop-self-recursion-bugfix.patch: Likewise
- simplify-removing-subregs.patch: Likewise
- medium-code-mode.patch: Bugfix
- fix-when-peeling-for-alignment.patch: Move to ...
- fix-PR-92351-When-peeling-for-alignment.patch: ... this
- AArch64-Fix-constraints-for-CPY-M.patch: New file
- Apply-maximum-nunits-for-BB-SLP.patch: New file
- Fix-EXTRACT_LAST_REDUCTION-segfault.patch: New file
- Fix-up-push_partial_def-little-endian-bitfield.patch: New file
- Fix-zero-masking-for-vcvtps2ph.patch: New file
- IRA-Handle-fully-tied-destinations.patch: New file
- SLP-VECT-Add-check-to-fix-96837.patch: New file
- aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch: New file
- aarch64-Fix-bf16-and-matrix-g++-gfortran.patch: New file
- aarch64-Fix-mismatched-SVE-predicate-modes.patch: New file
- aarch64-fix-sve-acle-error.patch: New file
- adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch: New file
- bf16-and-matrix-characteristic.patch: New file
- fix-ICE-IPA-compare-VRP-types.patch: New file
- fix-ICE-in-affine-combination.patch: New file
- fix-ICE-in-pass-vect.patch: New file
- fix-ICE-in-vect_update_misalignment_for_peel.patch: New file
- fix-addlosymdi-ICE-in-pass-reload.patch: New file
- fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch: New file
- fix-avx512vl-vcvttpd2dq-2-fail.patch: New file
- fix-issue499-add-nop-convert.patch: New file
- fix-issue604-ldist-dependency-fixup.patch: New file
- modulo-sched-Carefully-process-loop-counter-initiali.patch: New file
- re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch: New file
- reduction-paths-with-unhandled-live-stmt.patch: New file
- redundant-loop-elimination.patch: New file
- sccvn-Improve-handling-of-load-masked-with-integer.patch: New file
- speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch: New file
- store-merging-Consider-also-overlapping-stores-earlier.patch: New file
- tree-optimization-96920-another-ICE-when-vectorizing.patch: New file
- tree-optimization-97812-fix-range-query-in-VRP-asser.patch: New file
- vectorizable-comparison-Swap-operands-only-once.patch: New file
- x86-Fix-bf16-and-matrix.patch: New file
2020-12-30 09:54:10 +08:00

This backport contains 4 patches from the GCC mainstream tree.
The commit IDs of these patches are listed below in chronological order.
0001-re-PR-target-90424-memcpy-into-vector-builtin-not-op.patch
1bf2a0b90f2457f6d9301535560eb5e05978261b
0002-testsuite-aarch64-arm-Add-missing-quotes-to-expected.patch
0ec537f3500924f29505977aa89c2a1d4671c584
0003-x86-Tweak-testcases-for-PR82361.patch
ad4644f378fe2f731cd987a4aff14b935f530b88
0004-x86-Robustify-vzeroupper-handling-across-calls.patch
2a2e3a0dfcbe0861915f421d11b828f0c35023f0
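
For reference, PR target/90424 (the first patch above) concerns __builtin_memcpy of a partial vector being lowered through the stack instead of a single load plus vector insert; the new pr90424-1.C/pr90424-2.C testcases below exercise exactly this. A minimal C sketch of the affected pattern, with an illustrative function name and sizes that are not part of the patch itself:

/* Illustrative only: copying 8 bytes into a 16-byte vector should become
   a single 64-bit load plus a vector insert rather than a round trip
   through the stack (compare the pr90424-1.C / pr90424-2.C testcases).  */
typedef int v4si __attribute__ ((vector_size (16)));

v4si
load_low_half (const void *p)
{
  v4si r = { 0, 0, 0, 0 };
  __builtin_memcpy (&r, p, 8);   /* fills only the low two lanes */
  return r;
}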
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9282a8fb6..ba72da1ec 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3. If not see
#include "i386-builtins.h"
#include "i386-expand.h"
#include "i386-features.h"
+#include "function-abi.h"
/* This file should be included last. */
#include "target-def.h"
@@ -13529,6 +13530,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
}
+ /* If the function is known to preserve some SSE registers,
+ RA and previous passes can legitimately rely on that for
+ modes wider than 256 bits. It's only safe to issue a
+ vzeroupper if all SSE registers are clobbered. */
+ const function_abi &abi = insn_callee_abi (insn);
+ if (!hard_reg_set_subset_p (reg_class_contents[ALL_SSE_REGS],
+ abi.mode_clobbers (V4DImode)))
+ return AVX_U128_ANY;
+
return AVX_U128_CLEAN;
}
diff --git a/gcc/testsuite/g++.target/i386/pr90424-1.C b/gcc/testsuite/g++.target/i386/pr90424-1.C
new file mode 100644
index 000000000..9df8c089b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr90424-1.C
@@ -0,0 +1,32 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, unsigned M = sizeof(V<T>)>
+V<T> load(const void *p) {
+ using W = V<T>;
+ W r;
+ __builtin_memcpy(&r, p, M);
+ return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *); // bad
+template V<short> load<short, 8>(const void *); // bad
+template V<int> load<int, 8>(const void *); // bad
+template V<long> load<long, 8>(const void *); // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *); // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *); // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *); // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function. */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr90424-2.C b/gcc/testsuite/g++.target/i386/pr90424-2.C
new file mode 100644
index 000000000..3abb65f45
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr90424-2.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
+
+template <class T>
+using V [[gnu::vector_size(16)]] = T;
+
+template <class T, unsigned M = sizeof(V<T>)>
+V<T> load(const void *p) {
+ V<T> r = {};
+ __builtin_memcpy(&r, p, M);
+ return r;
+}
+
+// movq or movsd
+template V<char> load<char, 8>(const void *); // bad
+template V<short> load<short, 8>(const void *); // bad
+template V<int> load<int, 8>(const void *); // bad
+template V<long> load<long, 8>(const void *); // good
+// the following is disabled because V2SF isn't a supported mode
+// template V<float> load<float, 8>(const void *); // bad
+template V<double> load<double, 8>(const void *); // good (movsd?)
+
+// movd or movss
+template V<char> load<char, 4>(const void *); // bad
+template V<short> load<short, 4>(const void *); // bad
+template V<int> load<int, 4>(const void *); // good
+template V<float> load<float, 4>(const void *); // good
+
+/* We should end up with one load and one insert for each function. */
+/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
index 184990471..d96a8733a 100644
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
@@ -13,4 +13,4 @@ foo (uint8x16_t a, uint8x16_t b, uint8x16_t c)
return vbslq_u8 (a, b, c); /* { dg-message "called from here" } */
}
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
diff --git a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
index 05dc579f2..fb6e0b9cd 100644
--- a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
+++ b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
@@ -14,5 +14,5 @@ foo (uint8x16_t *p)
*p = vmovq_n_u8 (3); /* { dg-message "called from here" } */
}
-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
+/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-1.c b/gcc/testsuite/gcc.target/i386/pr82361-1.c
index e7c356557..dec1792ae 100644
--- a/gcc/testsuite/gcc.target/i386/pr82361-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr82361-1.c
@@ -4,50 +4,50 @@
/* We should be able to optimize all %eax to %rax zero extensions, because
div and idiv instructions with 32-bit operands zero-extend both results. */
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
-/* FIXME: We are still not able to optimize the modulo in f1/f2, only manage
- one. */
+/* FIXME: The compiler does not merge zero-extension to the modulo part
+ of f1 and f2. */
/* { dg-final { scan-assembler-times "movl\t%edx" 2 } } */
void
f1 (unsigned int a, unsigned int b)
{
- unsigned long long c = a / b;
- unsigned long long d = a % b;
+ register unsigned long long c asm ("rax") = a / b;
+ register unsigned long long d asm ("rdx") = a % b;
asm volatile ("" : : "r" (c), "r" (d));
}
void
f2 (int a, int b)
{
- unsigned long long c = (unsigned int) (a / b);
- unsigned long long d = (unsigned int) (a % b);
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
asm volatile ("" : : "r" (c), "r" (d));
}
void
f3 (unsigned int a, unsigned int b)
{
- unsigned long long c = a / b;
+ register unsigned long long c asm ("rax") = a / b;
asm volatile ("" : : "r" (c));
}
void
f4 (int a, int b)
{
- unsigned long long c = (unsigned int) (a / b);
+ register unsigned long long c asm ("rax") = (unsigned int) (a / b);
asm volatile ("" : : "r" (c));
}
void
f5 (unsigned int a, unsigned int b)
{
- unsigned long long d = a % b;
+ register unsigned long long d asm ("rdx") = a % b;
asm volatile ("" : : "r" (d));
}
void
f6 (int a, int b)
{
- unsigned long long d = (unsigned int) (a % b);
+ register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
asm volatile ("" : : "r" (d));
}
diff --git a/gcc/testsuite/gcc.target/i386/pr82361-2.c b/gcc/testsuite/gcc.target/i386/pr82361-2.c
index c1e484d6e..2d87de182 100644
--- a/gcc/testsuite/gcc.target/i386/pr82361-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr82361-2.c
@@ -4,7 +4,8 @@
/* We should be able to optimize all %eax to %rax zero extensions, because
div and idiv instructions with 32-bit operands zero-extend both results. */
/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
-/* Ditto %edx to %rdx zero extensions. */
-/* { dg-final { scan-assembler-not "movl\t%edx, %edx" } } */
+/* FIXME: The compiler does not merge zero-extension to the modulo part
+ of f1 and f2. */
+/* { dg-final { scan-assembler-times "movl\t%edx" 4 } } */
#include "pr82361-1.c"
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index 527deffe4..be47519bc 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -4297,8 +4297,17 @@ verify_gimple_assign_ternary (gassign *stmt)
}
if (! ((INTEGRAL_TYPE_P (rhs1_type)
&& INTEGRAL_TYPE_P (rhs2_type))
+ /* Vector element insert. */
|| (VECTOR_TYPE_P (rhs1_type)
- && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
+ && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
+ /* Aligned sub-vector insert. */
+ || (VECTOR_TYPE_P (rhs1_type)
+ && VECTOR_TYPE_P (rhs2_type)
+ && types_compatible_p (TREE_TYPE (rhs1_type),
+ TREE_TYPE (rhs2_type))
+ && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
+ TYPE_VECTOR_SUBPARTS (rhs2_type))
+ && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
{
error ("not allowed type combination in BIT_INSERT_EXPR");
debug_generic_expr (rhs1_type);
diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c
index 1dc544b6d..a149f5e79 100644
--- a/gcc/tree-ssa.c
+++ b/gcc/tree-ssa.c
@@ -1522,8 +1522,6 @@ non_rewritable_lvalue_p (tree lhs)
if (DECL_P (decl)
&& VECTOR_TYPE_P (TREE_TYPE (decl))
&& TYPE_MODE (TREE_TYPE (decl)) != BLKmode
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
- TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
&& known_ge (mem_ref_offset (lhs), 0)
&& known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
mem_ref_offset (lhs))
@@ -1531,7 +1529,24 @@ non_rewritable_lvalue_p (tree lhs)
TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
&& known_ge (wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (decl))),
wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (lhs)))))
- return false;
+ {
+ poly_uint64 lhs_bits, nelts;
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)), &lhs_bits)
+ && multiple_p (lhs_bits,
+ tree_to_uhwi
+ (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))),
+ &nelts))
+ {
+ if (known_eq (nelts, 1u))
+ return false;
+ /* For sub-vector inserts the insert vector mode has to be
+ supported. */
+ tree vtype = build_vector_type (TREE_TYPE (TREE_TYPE (decl)),
+ nelts);
+ if (TYPE_MODE (vtype) != BLKmode)
+ return false;
+ }
+ }
}
/* A vector-insert using a BIT_FIELD_REF is rewritable using
@@ -1869,20 +1884,30 @@ execute_update_addresses_taken (void)
&& bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
&& VECTOR_TYPE_P (TREE_TYPE (sym))
&& TYPE_MODE (TREE_TYPE (sym)) != BLKmode
- && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
- TYPE_SIZE_UNIT
- (TREE_TYPE (TREE_TYPE (sym))), 0)
- && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
- && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
- TYPE_SIZE_UNIT (TREE_TYPE (sym)))
- && (tree_to_uhwi (TREE_OPERAND (lhs, 1))
- % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
+ && known_ge (mem_ref_offset (lhs), 0)
+ && known_gt (wi::to_poly_offset
+ (TYPE_SIZE_UNIT (TREE_TYPE (sym))),
+ mem_ref_offset (lhs))
+ && multiple_of_p (sizetype,
+ TREE_OPERAND (lhs, 1),
+ TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
{
tree val = gimple_assign_rhs1 (stmt);
if (! types_compatible_p (TREE_TYPE (val),
TREE_TYPE (TREE_TYPE (sym))))
{
- tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
+ poly_uint64 lhs_bits, nelts;
+ tree temtype = TREE_TYPE (TREE_TYPE (sym));
+ if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)),
+ &lhs_bits)
+ && multiple_p (lhs_bits,
+ tree_to_uhwi
+ (TYPE_SIZE (TREE_TYPE
+ (TREE_TYPE (sym)))),
+ &nelts)
+ && maybe_ne (nelts, 1u))
+ temtype = build_vector_type (temtype, nelts);
+ tree tem = make_ssa_name (temtype);
gimple *pun
= gimple_build_assign (tem,
build1 (VIEW_CONVERT_EXPR,