This backport contains 2 patches from the gcc mainstream tree.  The commit IDs
of these patches are listed below in chronological order.

0001-AArch64-Improve-SVE-constant-moves.patch
4aeb1ba7f62c1d680c819ae3e137c3bad6f520ca

0002-aarch64-Add-vector-vector-vec_extract-patterns-PR928.patch
c15893df6eafc32efd6184379dd7f02c36da7d12

diff -Nurp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
--- a/gcc/config/aarch64/aarch64.c	2020-09-03 19:50:00.484000000 +0800
+++ b/gcc/config/aarch64/aarch64.c	2020-09-03 19:50:19.336943210 +0800
@@ -3632,7 +3632,7 @@ aarch64_maybe_expand_sve_subreg_move (rt
    attributes.  Unlike gen_lowpart, this doesn't care whether the
    mode change is valid.  */
 
-static rtx
+rtx
 aarch64_replace_reg_mode (rtx x, machine_mode mode)
 {
   if (GET_MODE (x) == mode)
@@ -15016,6 +15016,36 @@ aarch64_simd_check_vect_par_cnst_half (r
   return true;
 }
 
+/* Return a PARALLEL containing NELTS elements, with element I equal
+   to BASE + I * STEP.  */
+
+rtx
+aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
+{
+  rtvec vec = rtvec_alloc (nelts);
+  for (unsigned int i = 0; i < nelts; ++i)
+    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
+  return gen_rtx_PARALLEL (VOIDmode, vec);
+}
+
+/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
+   series with step STEP.  */
+
+bool
+aarch64_stepped_int_parallel_p (rtx op, int step)
+{
+  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
+    return false;
+
+  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
+  for (int i = 1; i < XVECLEN (op, 0); ++i)
+    if (!CONST_INT_P (XVECEXP (op, 0, i))
+        || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
+      return false;
+
+  return true;
+}
+
 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
    HIGH (exclusive).  */
 void
diff -Nurp a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
--- a/gcc/config/aarch64/aarch64-protos.h	2020-09-03 19:50:00.484000000 +0800
+++ b/gcc/config/aarch64/aarch64-protos.h	2020-09-03 19:50:29.137683100 +0800
@@ -501,6 +501,8 @@ bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
 bool aarch64_sve_struct_memory_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
+rtx aarch64_gen_stepped_int_parallel (unsigned int, int, int);
+bool aarch64_stepped_int_parallel_p (rtx, int);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
 unsigned aarch64_dbx_register_number (unsigned);
@@ -516,6 +518,7 @@ void aarch64_expand_mov_immediate (rtx,
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
 void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode);
 bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
+rtx aarch64_replace_reg_mode (rtx, machine_mode);
 void aarch64_split_sve_subreg_move (rtx, rtx, rtx);
 void aarch64_expand_prologue (void);
 void aarch64_expand_vector_init (rtx, rtx);
diff -Nurp a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
--- a/gcc/config/aarch64/aarch64-simd.md	2020-09-03 19:50:00.484000000 +0800
+++ b/gcc/config/aarch64/aarch64-simd.md	2020-09-03 19:50:44.100673150 +0800
@@ -282,37 +282,51 @@
         rtx dst_high_part = gen_highpart (<VHALF>mode, dst);
         rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
         rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
-
-        emit_insn
-          (gen_aarch64_simd_mov_from_<mode>low (dst_low_part, src, lo));
-        emit_insn
-          (gen_aarch64_simd_mov_from_<mode>high (dst_high_part, src, hi));
+        emit_insn (gen_aarch64_get_half<mode> (dst_low_part, src, lo));
+        emit_insn (gen_aarch64_get_half<mode> (dst_high_part, src, hi));
       }
     DONE;
   }
 )
 
-(define_insn "aarch64_simd_mov_from_<mode>low"
-  [(set (match_operand:<VHALF> 0 "register_operand" "=r")
+(define_expand "aarch64_get_half<mode>"
+  [(set (match_operand:<VHALF> 0 "register_operand")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_lo_half" "")))]
-  "TARGET_SIMD && reload_completed"
-  "umov\t%0, %1.d[0]"
-  [(set_attr "type" "neon_to_gp<q>")
-   (set_attr "length" "4")
-  ])
+          (match_operand:VQ 1 "register_operand")
+          (match_operand 2 "ascending_int_parallel")))]
+  "TARGET_SIMD"
+)
+
+(define_insn_and_split "aarch64_simd_mov_from_<mode>low"
+  [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
+        (vec_select:<VHALF>
+          (match_operand:VQ_NO2E 1 "register_operand" "w,w")
+          (match_operand:VQ_NO2E 2 "vect_par_cnst_lo_half" "")))]
+  "TARGET_SIMD"
+  "@
+   #
+   umov\t%0, %1.d[0]"
+  "&& reload_completed && aarch64_simd_register (operands[0], <VHALF>mode)"
+  [(set (match_dup 0) (match_dup 1))]
+  {
+    operands[1] = aarch64_replace_reg_mode (operands[1], <VHALF>mode);
+  }
+  [(set_attr "type" "mov_reg,neon_to_gp<q>")
+   (set_attr "length" "4")]
+)
 
 (define_insn "aarch64_simd_mov_from_<mode>high"
-  [(set (match_operand:<VHALF> 0 "register_operand" "=r")
+  [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_hi_half" "")))]
-  "TARGET_SIMD && reload_completed"
-  "umov\t%0, %1.d[1]"
-  [(set_attr "type" "neon_to_gp<q>")
-   (set_attr "length" "4")
-  ])
+          (match_operand:VQ_NO2E 1 "register_operand" "w,w")
+          (match_operand:VQ_NO2E 2 "vect_par_cnst_hi_half" "")))]
+  "TARGET_SIMD"
+  "@
+   dup\\t%d0, %1.d[1]
+   umov\t%0, %1.d[1]"
+  [(set_attr "type" "neon_dup<q>,neon_to_gp<q>")
+   (set_attr "length" "4")]
+)
 
 (define_insn "orn<mode>3"
   [(set (match_operand:VDQ_I 0 "register_operand" "=w")
@@ -6016,6 +6030,35 @@
   DONE;
 })
 
+;; Extract a 64-bit vector from one half of a 128-bit vector.
+(define_expand "vec_extract<mode><Vhalf>"
+  [(match_operand:<VHALF> 0 "register_operand")
+   (match_operand:VQ_NO2E 1 "register_operand")
+   (match_operand 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  int start = INTVAL (operands[2]);
+  if (start != 0 && start != <nunits> / 2)
+    FAIL;
+  rtx sel = aarch64_gen_stepped_int_parallel (<nunits> / 2, start, 1);
+  emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], sel));
+  DONE;
+})
+
+;; Extract a single-element 64-bit vector from one half of a 128-bit vector.
+(define_expand "vec_extractv2dfv1df"
+  [(match_operand:V1DF 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")
+   (match_operand 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  /* V1DF is rarely used by other patterns, so it should be better to hide
+     it in a subreg destination of a normal DF op.  */
+  rtx scalar0 = gen_lowpart (DFmode, operands[0]);
+  emit_insn (gen_vec_extractv2dfdf (scalar0, operands[1], operands[2]));
+  DONE;
+})
+
 ;; aes
 
 (define_insn "aarch64_crypto_aes<aes_op>v16qi"
diff -Nurp a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
--- a/gcc/config/aarch64/predicates.md	2020-09-03 19:50:00.484000000 +0800
+++ b/gcc/config/aarch64/predicates.md	2020-09-03 19:50:49.315344350 +0800
@@ -438,6 +438,12 @@
   return aarch64_simd_check_vect_par_cnst_half (op, mode, false);
 })
 
+(define_predicate "ascending_int_parallel"
+  (match_code "parallel")
+{
+  return aarch64_stepped_int_parallel_p (op, 1);
+})
+
 (define_special_predicate "aarch64_simd_lshift_imm"
   (match_code "const,const_vector")
 {