This backport contains 2 patches from the GCC mainstream tree. The commit IDs of these patches are listed below in chronological order. 97b798d80baf945ea28236eef3fa69f36626b579 0001-SLP-VECT-Add-check-to-fix-96837.patch 373b99dc40949efa697326f378e5022a02e0328b 0002-Add-a-testcase-for-PR-target-96827.patch diff -uprN a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c --- a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c 2020-11-17 15:58:12.118126065 +0800 @@ -0,0 +1,28 @@ +/* This checks that vectorized constructors have the correct ordering. */ +/* { dg-require-effective-target vect_int } */ + +typedef int V __attribute__((__vector_size__(16))); + +__attribute__((__noipa__)) void +foo (unsigned int x, V *y) +{ + unsigned int a[4] = { x + 0, x + 2, x + 4, x + 6 }; + for (unsigned int i = 0; i < 3; ++i) + if (a[i] == 1234) + a[i]--; + *y = (V) { a[3], a[2], a[1], a[0] }; +} + +int +main () +{ + V b; + foo (0, &b); + if (b[0] != 6 || b[1] != 4 || b[2] != 2 || b[3] != 0) + __builtin_abort (); + return 0; +} + +/* See that we vectorize an SLP instance. 
*/ +/* { dg-final { scan-tree-dump "Analyzing vectorizable constructor" "slp1" } } */ +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "slp1" } } */ diff -uprN a/gcc/testsuite/gcc.target/i386/pr96827.c b/gcc/testsuite/gcc.target/i386/pr96827.c --- a/gcc/testsuite/gcc.target/i386/pr96827.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.target/i386/pr96827.c 2020-11-17 15:58:15.182126065 +0800 @@ -0,0 +1,41 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O3 -msse2 -mfpmath=sse" } */ + +typedef unsigned short int __uint16_t; +typedef unsigned int __uint32_t; +typedef __uint16_t uint16_t; +typedef __uint32_t uint32_t; +typedef int __v4si __attribute__ ((__vector_size__ (16))); +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_si128 (__m128i *__P, __m128i __B) +{ + *__P = __B; +} +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) +{ + return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +} +typedef uint16_t u16; +typedef uint32_t u32; +extern int printf (const char *__restrict __format, ...); +void do_the_thing(u32 idx, __m128i *dude) +{ + u32 dude_[4] = { idx+0, idx+2, idx+4, idx+6 }; + for (u32 i = 0; i < 3; ++i) + if (dude_[i] == 1234) + dude_[i]--; + *dude = _mm_set_epi32(dude_[0], dude_[1], dude_[2], dude_[3]); +} +int main() +{ + __m128i dude; + u32 idx = 0; + do_the_thing(idx, &dude); + __attribute__((aligned(16))) u32 dude_[4]; + _mm_store_si128((__m128i*)dude_, dude); + if (!(6 == dude_[0] && 4 == dude_[1] && 2 == dude_[2] && 0 == dude_[3])) + __builtin_abort (); + return 0; +} diff -uprN a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c --- a/gcc/tree-vect-slp.c 2020-11-17 15:55:57.098126065 +0800 +++ b/gcc/tree-vect-slp.c 2020-11-17 15:59:25.862126065 +0800 @@ -1842,7 +1842,8 @@ 
vect_supported_load_permutation_p (slp_i /* Reduction (there are no data-refs in the root). In reduction chain the order of the loads is not important. */ if (!STMT_VINFO_DATA_REF (stmt_info) - && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) + && !SLP_INSTANCE_ROOT_STMT (slp_instn)) vect_attempt_slp_rearrange_stmts (slp_instn); /* In basic block vectorization we allow any subchain of an interleaving